Python：将不同语言的字符串连接成文本
# -*- coding: utf-8 -*-
"""Join word tokens from different languages into one text string.

A single space is inserted between two adjacent tokens when both start with
a letter or digit of a script that separates words with spaces. Punctuation
tokens and tokens starting with wide East Asian characters (e.g. Chinese
ideographs) are concatenated without a space.
"""
import sys
import unicodedata

# Standard-library-only Python 2/3 compatibility shims.
_PY2 = sys.version_info[0] == 2
if _PY2:
    _unichr = unichr  # noqa: F821 -- Python 2 builtin
    _range = xrange  # noqa: F821 -- Python 2 builtin
else:
    _unichr = chr
    _range = range

# Every character whose Unicode general category is Letter (L*) or Number (N*).
# The range is inclusive of sys.maxunicode so the last code point is examined.
_ALPHANUMERIC_CHAR_SET = set(
    char
    for char in (_unichr(i) for i in _range(sys.maxunicode + 1))
    if unicodedata.category(char).startswith(("L", "N"))
)

# East Asian Width classes ("Wide", "Fullwidth") of characters that are
# joined to their neighbours without a space.
_UNSPACED_WIDTHS = ("W", "F")


def _to_text(token):
    """Return *token* as a text string, decoding UTF-8 byte strings."""
    if isinstance(token, bytes):
        return token.decode("utf-8")
    return token


def _starts_spaced_word(text):
    """Return True if *text* begins with a letter/digit that takes spaces."""
    if not text:
        # An empty token has no first character; it never triggers a space.
        return False
    first = text[0]
    return (
        first in _ALPHANUMERIC_CHAR_SET
        and unicodedata.east_asian_width(first) not in _UNSPACED_WIDTHS
    )


def _join_tokens_to_string(tokens):
    """Join a list of string tokens into a single string.

    Args:
        tokens: sequence of tokens, each either a text string or a UTF-8
            encoded byte string. Empty tokens are allowed.

    Returns:
        The joined text string. A single space separates two adjacent tokens
        only when both start with a letter or digit that is not a wide East
        Asian character; all other tokens are concatenated directly.

    Raises:
        UnicodeDecodeError: if a byte-string token is not valid UTF-8.
    """
    # Decode first so classification looks at characters, not raw bytes;
    # otherwise a token such as "über" would be judged by its first byte.
    texts = [_to_text(token) for token in tokens]
    spaced = [_starts_spaced_word(text) for text in texts]
    parts = []
    for i, text in enumerate(texts):
        if i > 0 and spaced[i - 1] and spaced[i]:
            parts.append(u" ")
        parts.append(text)
    return u"".join(parts)


if __name__ == '__main__':
    texts = [
        ['hello', 'world'],
        ['mehr', 'Sicherheit', 'für'],
        ["从40万年前", "开始"],
    ]
    for text in texts:
        ret = _join_tokens_to_string(text)
        # Python 2 needs explicit UTF-8 bytes for non-ASCII output on a
        # pipe; Python 3 prints text directly.
        print(ret.encode("utf-8") if _PY2 else ret)
输出结果:
hello world
mehr Sicherheit für
从40万年前开始