Python 文本挖掘:解决Python中文编码问题
转于:http://rzcoding.blog.163.com/blog/static/2222810172013101785738166/
#! /usr/bin/env python2.7
#coding=utf-8
# Declare the Python version and the source-file encoding at the top of every script.
sometext.decode("utf8")# decode a UTF-8 byte string into a unicode object
sometext.encode("utf8")# encode a unicode object back into a UTF-8 byte string
str(sometext) # NOTE(review): the coding declaration only governs how source literals are read; in Python 2 str() on a unicode object uses the default (ASCII) codec, so this presumably fails on non-ASCII text -- prefer .encode("utf8"); confirm
#! /usr/bin/env python2.7
#coding=utf-8
# A plain quoted literal; the output below shows it is a byte string (`str`).
token =",.!?:;"
print type(token)
>><type 'str'>
# Decoding the byte string yields a `unicode` object, as the next output line shows.
print type(token.decode('utf-8'))
>><type 'unicode'>
由于在开头声明了所有字符串类型的编码都为utf-8,所以“<type 'str'>”的意思是该字符是字符串类型,所以编码为utf-8。而它在decode 之后可以看出它已经成为unicode 编码了。
#! /usr/bin/env python2.7
#coding=utf-8
token =",。!?:;"
for t in token:
print t
>>UnicodeDecodeError:'ascii' codec can't decode byte balabala in position balabala : ordinal not in range
for t in token.decode('utf-8'):
print t,
>> , 。 ! ? : ;
把字符串类型的token 解码为unicode 之后,就可以被循环遍历了。
#! /usr/bin/env python2.7
#coding=utf-8
token =",。!?:;".decode("utf8")
string_list =['我','是','一只','大','苹果',',','又','香','又','甜','又','好吃','。']
for t in token:
for sl in string_list:
if t == sl:#无法匹配,因为一个是unicode编码,一个是utf-8编码
print t,
>>
for t in token:
for sl in string_list:
if t == sl.decode("utf8") #这时两个都是unicode编码了,可以匹配了
print t,
>> , 。
由于string_list 里面都是utf-8编码的元素,因此在匹配的时候需要解码成unicode 才能和已经解码的token 匹配。
#! /usr/bin/env python2.7
#coding=utf-8
import jieba
string ="我是一只大苹果,又香又甜又好吃。"
string_list =[]
seg = jieba.cut(string)
for word in seg:
string_list.append(word)
print string_list
>>[u'\u6211', u'\u662f', u'\u4e00\u53ea', u'\u5927', u'\u82f9\u679c', u'\uff0c', u'\u53c8', u'\u9999', u'\u53c8', u'\u751c', u'\u53c8', u'\u597d\u5403', u'\u3002']
② 去掉标点符号
#! /usr/bin/env python2.7
#coding=utf-8
import jieba
string ="我是一只大苹果,又香又甜又好吃。"
string_list =[]
seg = jieba.cut(string)
for i in seg:
string_list.append(i)
token =",。!?:;".decode('utf-8')
filer_seg =[fil for fil in seg if fil notin token]# 用Python的列表推导来去掉标点,形成一个去掉标点后的数组
for word in filter_seg:
print word,
>> 我 是 一只 大 苹果 又 香 又 甜 又 好吃
#! /usr/bin/env python2.7
#coding=utf-8
import jieba

# Sentence to segment, written as a UTF-8 encoded byte-string literal.
string = "我是一只大苹果,又香又甜又好吃。"
# Collect the segmented words (unicode objects) into a list.
string_list = list(jieba.cut(string))
# The with-statement closes the file even if a write fails part-way through.
with open('D:/code/example.txt','w') as f:
    for word in string_list:
        # Encode explicitly: str(word) would go through the ASCII default codec
        # and raise UnicodeEncodeError on Chinese text.
        f.write(word.encode('utf-8')+' ')