利用朴素贝叶斯根据名字判断性别

import pandas as pd
from collections import defaultdict
import math


# Read, in this order, the training table, the test table and the submission
# template from the current working directory.
train, test, submit = (
    pd.read_csv(path)
    for path in ('train.txt', 'test.txt', 'sample_submit.csv')
)

# Preview the first ten training rows. The returned frame is discarded, so
# this has no visible effect when the file is run as a plain script.
train.head(10)

#%%
# Partition the training rows by label: gender 0 is female, gender 1 is male.
names_female = train[train['gender'] == 0]
names_male = train[train['gender'] == 1]

# totals stores the number of female and male rows in the training set.
totals = {'f': len(names_female), 'm': len(names_male)}


def _char_frequencies(names, group_size):
    """Return a defaultdict mapping each character to occurrences / group_size.

    Every occurrence of a character in ``names`` adds ``1 / group_size``, so a
    character repeated inside one name is counted once per repetition.
    The result is a ``defaultdict(int)``: indexing a character that never
    occurred yields 0 and inserts that character as a new key.
    """
    frequencies = defaultdict(int)
    for full_name in names:
        for symbol in full_name:
            # The division stays inside the loop so that an empty group
            # (group_size == 0) never triggers a division at all.
            frequencies[symbol] += 1. / group_size
    return frequencies


frequency_list_f = _char_frequencies(names_female['name'], totals['f'])
frequency_list_m = _char_frequencies(names_male['name'], totals['m'])

# Spot check: show the frequency of one character in each class.
print(frequency_list_f['娟'])
print(frequency_list_m['钢'])

#%%
def LaplaceSmooth(char, frequency_list, total, alpha=1.0):
    """Return the Laplace-smoothed frequency of ``char`` for one class.

    Args:
        char: The character to look up.
        frequency_list: Mapping from character to its relative frequency
            (occurrences divided by ``total``).
        total: Number of training samples the frequencies were divided by.
        alpha: Additive smoothing strength.

    Returns:
        ``(count + alpha) / (total + distinct_chars * alpha)``, where
        ``count`` is the occurrence count recovered from the stored frequency
        (0 for a character that is not in the mapping) and ``distinct_chars``
        is the number of keys in ``frequency_list``.
    """
    # Read with .get() instead of indexing: the mapping passed in by this
    # script is a defaultdict, and indexing an unseen character would insert
    # it, growing len(frequency_list) and making every later result depend on
    # which characters happened to be queried before.
    count = frequency_list.get(char, 0) * total
    distinct_chars = len(frequency_list)
    return (count + alpha) / (total + distinct_chars * alpha)

#%%
# Base score of each class: log prior plus the sum of log(1 - frequency) over
# every character recorded for that class.
# Assuming gender only takes the values 0 and 1, its mean is the fraction of
# male rows and (1 - mean) the fraction of female rows.
# NOTE(review): these sums use the raw frequencies, whereas GetLogProb works
# with Laplace-smoothed ones - confirm the mismatch is intended.
male_share = train['gender'].mean()

base_f = math.log(1 - male_share) + sum(
    math.log(1 - freq) for freq in frequency_list_f.values()
)
base_m = math.log(male_share) + sum(
    math.log(1 - freq) for freq in frequency_list_m.values()
)

bases = {'f': base_f, 'm': base_m}
#%%
def GetLogProb(char, frequency_list, total):
    """Return the log-odds of ``char`` under one class.

    The value is ``log(p) - log(1 - p)``, where ``p`` is the smoothed
    frequency returned by ``LaplaceSmooth(char, frequency_list, total)``.
    """
    p_present = LaplaceSmooth(char, frequency_list, total)
    log_present = math.log(p_present)
    log_absent = math.log(1 - p_present)
    return log_present - log_absent
#%%
def ComputeLogProb(name, bases, totals, frequency_list_m, frequency_list_f):
    """Score ``name`` under the male and the female model.

    Each score starts from the class's base value (``bases['m']`` /
    ``bases['f']``) and adds ``GetLogProb`` for every character of ``name``;
    a character that occurs several times is added once per occurrence.

    Returns:
        A dict ``{'male': score, 'female': score}``.
    """
    scores = {'male': bases['m'], 'female': bases['f']}
    # Keep an explicit running addition (male first, then female) so the
    # floating-point accumulation order is fixed.
    for symbol in name:
        scores['male'] += GetLogProb(symbol, frequency_list_m, totals['m'])
        scores['female'] += GetLogProb(symbol, frequency_list_f, totals['f'])
    return scores

def GetGender(LogProbs):
    """Return True when the male score is strictly greater than the female one.

    ``LogProbs`` is a mapping with ``'male'`` and ``'female'`` entries. Equal
    scores yield False.
    """
    male_score = LogProbs['male']
    female_score = LogProbs['female']
    return male_score > female_score

# Classify every test name in order; GetGender's boolean is stored as an int,
# i.e. 1 when the male score wins and 0 otherwise.
result = [
    int(GetGender(ComputeLogProb(name, bases, totals,
                                 frequency_list_m, frequency_list_f)))
    for name in test['name']
]

# Put the predictions into the submission table and save it without the index.
submit['gender'] = result
submit.to_csv('my_NB_prediction.csv', index=False)

相关阅读:
尝试用微博记录 SQL Server 2012开发者训练营笔记
 Contact Manager Web API 示例[4] 异常处理(Exception Handling)
使用IAPIExplorer列举ASP.NET Web API
从 WebAPI Beta 更新到WebAPI RC
Tracing in ASP.NET Web API
一个基于asp.net2.0空间的webproxy程序：ASProxy
微软发布平板电脑 – Surface
Big Data, Hadoop and StreamInsight™
REST 入门介绍
 通过企业分布式缓存共享运行时数据
原文地址：https://www.cnblogs.com/liujinxin123/p/12499636.html