• 如何高效地完成中文分词?


    在说分词之前,笔者先来介绍下何为分词:分词就是将连续的字序列按照一定的规范重新组合成词序列的过程。英文中,单词之间以空格作为自然分界符,而中文没有这样的分界符,分词就复杂多了,需要借助专门的算法,对于初学者来说还是有一定难度的。这里笔者只介绍一种最简单的方式——直接调用现成的分词接口,有兴趣的朋友可以看下。下面直接上代码,先看 Python 的实现方式:

    # -*- coding: utf-8 -*-
    # flake8: noqa
    __author__ = 'wukong'
     
    import urllib
    from urllib import urlencode
     
    # Configure the appKey and openId you applied for; both are sent with every request.
    app_key="***"
    open_id="***"
     
    def request_content(request_url,params,method):
        """Send ``params`` to ``request_url``, then print and return the response body.

        request_url -- endpoint URL to call
        params      -- mapping of request parameters; URL-encoded before sending
        method      -- "get" (case-insensitive) puts the parameters in the query
                       string; any other value, including None or "", passes them
                       to urlopen as the request body, which urllib sends as a POST

        Returns the raw response body as read from the connection.
        """
        params = urlencode(params)

        if method and method.lower() == "get":
            f = urllib.urlopen("%s?%s" % (request_url, params))
        else:
            f = urllib.urlopen(request_url, params)

        # Always release the connection, even if reading the body fails.
        try:
            content = f.read()
        finally:
            f.close()

        print(content)
        return content
     
       
    def main():
        """Call the article's Chinese word-segmentation endpoint with a sample sentence."""
        request_url = "http://api.xiaocongjisuan.com/" + "data/chinesekeyword/analysis"

        # Credentials first, then the request-specific part: the text to analyse.
        params = {
            "appKey": app_key,
            "openId": open_id,
            "content": "我是一个中国人,你知道嘛",
        }

        request_content(request_url, params, "get")
        
    # Run the demo only when this file is executed as a script, not when it is imported.
    if __name__ == '__main__':
        main()
    

    再以 Java 为例:

    package com.xiaocongjisuan.module.example;
     
    import java.io.BufferedReader;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.UnsupportedEncodingException;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLEncoder;
    import java.util.HashMap;
    import java.util.Map;
     
    /**
     * Minimal HTTP client example that calls the xiaocongjisuan open API.
     *
     * <p>{@link #main(String[])} is wired to the Chinese word-segmentation endpoint;
     * {@link #requestContent(String, Map, String)} and {@link #urlEncode(Map)} are
     * general helpers that can be reused for other endpoints.
     */
    public class Application {

        /** Charset used to URL-encode parameters and to decode the response body. */
        public static final String DEF_CHATSET = "UTF-8";
        /** Connect timeout in milliseconds. */
        public static final int DEF_CONN_TIMEOUT = 30000;
        /** Read timeout in milliseconds. */
        public static final int DEF_READ_TIMEOUT = 30000;
        /** Value sent as the {@code User-agent} request header; may be reassigned by callers. */
        public static String userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36";

        // Configure the appKey and openId you applied for.
        public static final String APP_KEY = "yours";
        public static final String OPEN_ID = "yours";

        /**
         * Converts a parameter map into an {@code application/x-www-form-urlencoded} string.
         *
         * <p>Both keys and values are percent-encoded, so characters such as
         * {@code &}, {@code =} or spaces cannot corrupt the resulting query string.
         * Values are converted with {@link String#valueOf(Object)}, so a {@code null}
         * value is sent as the text {@code "null"}.
         *
         * @param params request parameters; may be {@code null}
         * @return {@code key=value} pairs joined by {@code &}, in the map's iteration
         *         order; an empty string when {@code params} is {@code null} or empty
         */
        public static String urlEncode(Map<String,Object> params) {

            if (params == null) {
                return "";
            }

            StringBuilder sb = new StringBuilder();
            for (Map.Entry<String,Object> entry : params.entrySet()) {
                // Separator goes before every pair except the first, so nothing has to be trimmed afterwards.
                if (sb.length() > 0) {
                    sb.append('&');
                }
                try {
                    sb.append(URLEncoder.encode(String.valueOf(entry.getKey()), DEF_CHATSET))
                      .append('=')
                      .append(URLEncoder.encode(String.valueOf(entry.getValue()), DEF_CHATSET));
                } catch (UnsupportedEncodingException e) {
                    // Fail loudly instead of silently dropping a parameter.
                    throw new IllegalStateException(DEF_CHATSET + " encoding is not supported", e);
                }
            }
            return sb.toString();
        }

        /**
         * Sends an HTTP request and returns the response body.
         *
         * <p>When {@code method} is {@code "post"} (case-insensitive) the encoded
         * parameters are written as the request body. Any other value, including
         * {@code null}, results in a GET request with the parameters appended to
         * the URL as a query string.
         *
         * @param requestUrl request address
         * @param params request parameters; may be {@code null}
         * @param method request method, {@code "get"} or {@code "post"}
         * @return the response body with line terminators removed, or {@code null}
         *         when an I/O error occurred (the stack trace is printed to stderr)
         * @throws Exception if closing the response reader fails
         */
        public static String requestContent(String requestUrl, Map<String,Object> params, String method) throws Exception {

            // Null-safe comparison: a null method simply falls back to GET.
            boolean isPost = "post".equalsIgnoreCase(method);
            String query = urlEncode(params);

            HttpURLConnection conn = null;
            BufferedReader reader = null;
            String rs = null;
            try {

                // Build the request URL; GET carries the parameters in the query string.
                if (!isPost && query.length() > 0) {
                    requestUrl = requestUrl + "?" + query;
                }

                URL url = new URL(requestUrl);
                conn = (HttpURLConnection) url.openConnection();
                conn.setRequestMethod(isPost ? "POST" : "GET");

                if (isPost) {
                    conn.setDoOutput(true);
                    conn.setDoInput(true);
                    conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
                }

                // Connection settings
                conn.setRequestProperty("User-agent", userAgent);
                conn.setUseCaches(false);
                conn.setConnectTimeout(DEF_CONN_TIMEOUT);
                conn.setReadTimeout(DEF_READ_TIMEOUT);
                conn.setInstanceFollowRedirects(false);
                conn.connect();

                if (isPost && params != null) {
                    DataOutputStream out = new DataOutputStream(conn.getOutputStream());
                    try {
                        out.write(query.getBytes(DEF_CHATSET));
                        out.flush();
                    } finally {
                        // Closing the stream completes the request body before the response is read.
                        out.close();
                    }
                }

                // Read the response
                InputStream is = conn.getInputStream();
                reader = new BufferedReader(new InputStreamReader(is, DEF_CHATSET));
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
                rs = sb.toString();

            } catch (IOException e) {
                // Existing contract: report the failure and return null.
                e.printStackTrace();
            } finally {
                try {
                    if (reader != null) {
                        reader.close();
                    }
                } finally {
                    // Disconnect even when closing the reader throws.
                    if (conn != null) {
                        conn.disconnect();
                    }
                }
            }
            return rs;
        }

        /**
         * Demo entry point: sends a sample sentence to the Chinese word-segmentation
         * endpoint and prints the raw response.
         *
         * @param args unused
         * @throws Exception if the request helper fails while releasing resources
         */
        public static void main(String[] args) throws Exception {

            String domain = "http://api.xiaocongjisuan.com/";
            String servlet = "data/chinesekeyword/analysis";
            String method = "get";

            String requestUrl = domain + servlet;
            Map<String,Object> params = new HashMap<String,Object>();
            params.put("appKey", APP_KEY);
            params.put("openId", OPEN_ID);

            // Request-specific part: the text to segment. The sample sentence
            // ("I am Chinese, you know") is written with Unicode escapes so the
            // source compiles regardless of the file encoding passed to javac.
            params.put("content", "\u6211\u662f\u4e00\u4e2a\u4e2d\u56fd\u4eba,\u4f60\u77e5\u9053\u561b");

            String result = requestContent(requestUrl, params, method);
            System.out.println(result);
        }
    }

    原理主要是调用接口:把待分词的字符串作为参数传入,接口会自动把分词结果以 json 或者 xml 的形式返回,具体文档可以参考接口提供方的说明。这种实现方式很简单,省去了大量的开发时间,也屏蔽了语言之间的差异性,值得推荐。

  • 相关阅读:
    Oracle 常用的十大 DDL 对象
    Oracle DML
    Oracle 的常用概念
    Spring 4 : 整合 SSH
    Spring3 (事务管理)
    Spring2
    Spring 学习笔记一
    Xpath helper下载
    爬取链家北京市二手房的单个房源页信息
    爬取链家北京市二手房的链家编号
  • 原文地址:https://www.cnblogs.com/huangxie/p/11637417.html
Copyright © 2020-2023  润新知