>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> segmenter = StanfordSegmenter(path_to_jar='stanford-segmenter-3.8.0.jar',
...     path_to_sihan_corpora_dict='./data',
...     path_to_model='./data/pku.gz',
...     path_to_dict='./data/dict-chris6.ser.gz')
>>> sentence = u'这是斯坦福中文分词器测试'
>>> segmenter.segment(sentence)
u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5 '
>>> segmenter.segment_file('test.simp.utf8')
u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd \u4eba\u6c11 \u7684 \u5171\u540c \u613f\u671b \u662f \uff1a \u7ee7\u7eed \u53d1\u5c55 \u4eba\u7c7b \u4ee5\u5f80 \u521b\u9020 \u7684 \u4e00\u5207 \u6587\u660e \u6210\u679c \uff0c \u514b\u670d 20 \u4e16\u7eaa \u56f0\u6270 \u7740 \u4eba\u7c7b \u7684 \u6218\u4e89 \u548c \u8d2b\u56f0 \u95ee\u9898 \uff0c \u63a8\u8fdb \u548c\u5e73 \u4e0e \u53d1\u5c55 \u7684 \u5d07\u9ad8 \u4e8b\u4e1a \uff0c \u521b\u9020 \u4e00\u4e2a \u7f8e\u597d \u7684 \u4e16\u754c \u3002 '
>>> outfile = open('outfile', 'w')
>>> result = segmenter.segment(sentence)
>>> outfile.write(result.encode('UTF-8'))
>>> outfile.close()
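Note that the last four lines are Python 2 idiom: the file is opened in text mode and the unicode result is encoded to UTF-8 by hand before writing. Under Python 3, segment() returns a str and open() accepts an encoding argument, so the manual encode step goes away. A minimal sketch, continuing the session above (same segmenter and sentence; as before, a Java runtime on the PATH is assumed):

>>> result = segmenter.segment(sentence)
>>> with open('outfile', 'w', encoding='utf-8') as outfile:
...     chars = outfile.write(result)  # text-mode write() returns the character count
...

The with block also closes the file automatically, replacing the explicit outfile.close() call.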