• 关键词提取1-C#


     

    C# 中文分词算法(实现从文章中提取关键字算法)

    using System;
    using System.IO;
    using System.Text;
    using System.Collections;
    using System.Collections.Generic;
    using System.Text.RegularExpressions;
    namespace LumkitCms.Utils
    {
        /// <summary>
        /// 分词类
        /// </summary>
        public static class WordSpliter
        {
            #region 属性
            private static string SplitChar = " ";//分隔符
            #endregion
            //
            #region 数据缓存函数
            /// <summary>
            /// 数据缓存函数
            /// </summary>
            /// <param name="key">索引键</param>
            /// <param name="val">缓存的数据</param>
            private static void SetCache(string key, object val)
            {
                if (val == null)
                    val = " ";
                System.Web.HttpContext.Current.Application.Lock();
                System.Web.HttpContext.Current.Application.Set(key, val);
                System.Web.HttpContext.Current.Application.UnLock();
            }
            /// <summary>
            /// 读取缓存
            /// </summary>
            /// <param name="mykey"></param>
            /// <returns></returns>
            private static object GetCache(string key)
            {
                return System.Web.HttpContext.Current.Application.Get(key);
            }
            #endregion
            //
            #region 读取文本
            private static SortedList ReadTxtFile(string FilePath)
            {
                if (GetCache("cms_dict") == null)
                {
                    Encoding encoding = Encoding.GetEncoding("utf-8");
                    SortedList arrText = new SortedList();
                    //
                    try
                    {
                        FilePath = System.Web.HttpContext.Current.Server.MapPath(FilePath);
                        if (!File.Exists(FilePath))
                        {
                            arrText.Add("0", "文件" + FilePath + "不存在...");
                        }
                        else
                        {
                            StreamReader objReader = new StreamReader(FilePath, encoding);
                            string sLine = "";
                            //ArrayList arrText = new ArrayList();

                            while (sLine != null)
                            {
                                sLine = objReader.ReadLine();
                                if (sLine != null)
                                    arrText.Add(sLine, sLine);
                            }
                            //
                            objReader.Close();
                            objReader.Dispose();
                        }
                    }
                    catch (Exception ex)
                    {
                        throw ex;
                    }
                    SetCache("cms_dict", arrText);
                }
                return (SortedList)GetCache("cms_dict");
            }
            #endregion
            //
            #region 载入词典
            private static SortedList LoadDict(string dictfile)
            {
                return ReadTxtFile(dictfile);
            }
            #endregion
            //
            #region 判断某字符串是否在指定字符数组中
            private static bool StrIsInArray(string[] StrArray, string val)
            {
                for (int i = 0; i < StrArray.Length; i++)
                    if (StrArray[i] == val) return true;
                return false;
            }
            #endregion
            //
            #region 正则检测
            private static bool IsMatch(string str, string reg)
            {
                return new Regex(reg).IsMatch(str);
            }
            #endregion
            //
            #region 首先格式化字符串(粗分)
            private static string FormatStr(string val)
            {
                string result = "";
                if (val == null || val == "")
                    return "";
                //
                char[] CharList = val.ToCharArray();
                //
                string Spc = SplitChar;//分隔符
                int StrLen = CharList.Length;
                int CharType = 0; //0-空白 1-英文 2-中文 3-符号
                //
                for (int i = 0; i < StrLen; i++)
                {
                    string StrList = CharList[i].ToString();
                    if (StrList == null || StrList == "")
                        continue;
                    //
                    if (CharList[i] < 0x81)
                    {
                        #region
                        if (CharList[i] < 33)
                        {
                            if (CharType != 0 && StrList != "/n" && StrList != "/r")
                            {
                                result += " ";
                                CharType = 0;
                            }
                            continue;
                        }
                        else if (IsMatch(StrList, "[^0-9a-zA-Z@//.%#:///&_-]"))//排除这些字符
                        {
                            if (CharType == 0)
                                result += StrList;
                            else
                                result += Spc + StrList;
                            CharType = 3;
                        }
                        else
                        {
                            if (CharType == 2 || CharType == 3)
                            {
                                result += Spc + StrList;
                                CharType = 1;
                            }
                            else
                            {
                                if (IsMatch(StrList, "[@%#:]"))
                                {
                                    result += StrList;
                                    CharType = 3;
                                }
                                else
                                {
                                    result += StrList;
                                    CharType = 1;
                                }//end if No.4
                            }//end if No.3
                        }//end if No.2
                        #endregion
                    }//if No.1
                    else
                    {
                        //如果上一个字符为非中文和非空格,则加一个空格
                        if (CharType != 0 && CharType != 2)
                            result += Spc;
                        //如果是中文标点符号
                        if (!IsMatch(StrList, "^[/u4e00-/u9fa5]+$"))
                        {
                            if (CharType != 0)
                                result += Spc + StrList;
                            else
                                result += StrList;
                            CharType = 3;
                        }
                        else //中文
                        {
                            result += StrList;
                            CharType = 2;
                        }
                    }
                    //end if No.1

                }//exit for
                //
                return result;
            }
            #endregion
            //
            #region 分词
            /// <summary>
            /// 分词
            /// </summary>
            /// <param name="key">关键词</param>
            /// <returns></returns>
            private static ArrayList StringSpliter(string[] key, string dictfile)
            {
                ArrayList List = new ArrayList();
                try
                {
                    SortedList dict = LoadDict(dictfile);//载入词典
                    //
                    for (int i = 0; i < key.Length; i++)
                    {
                        if (IsMatch(key[i], @"^(?!^/.$)([a-zA-Z0-9/./u4e00-/u9fa5]+)$")) //中文、英文、数字
                        {
                            if (IsMatch(key[i], "^[/u4e00-/u9fa5]+$"))//如果是纯中文
                            {
                                int keyLen = key[i].Length;
                                if (keyLen < 2)
                                    continue;
                                else if (keyLen <= 7)
                                    List.Add(key[i]);
                                //开始分词
                                for (int x = 0; x < keyLen; x++)
                                {
                                    //x:起始位置//y:结束位置
                                    for (int y = x; y < keyLen; y++)
                                    {
                                        string val = key[i].Substring(x, keyLen - y);
                                        if (val == null || val.Length < 2)
                                            break;
                                        else if (val.Length > 10)
                                            continue;
                                        if (dict.Contains(val))
                                            List.Add(val);
                                    }
                                }
                            }
                            else if (!IsMatch(key[i], @"^(/.*)$"))//不全是小数点
                            {
                                List.Add(key[i]);
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    throw ex;
                }
                return List;
            }
            #endregion
            //
            #region 得到分词结果
            /// <summary>
            /// 得到分词结果
            /// </summary>
            /// <param name="key"></param>
            /// <returns></returns>
            public static string[] DoSplit(string key, string dictfile)
            {
                ArrayList KeyList = StringSpliter(FormatStr(key).Split(SplitChar.ToCharArray()), dictfile);
                KeyList.Insert(0, key);
                //去掉重复的关键词
                for (int i = 0; i < KeyList.Count; i++)
                {
                    for (int j = 0; j < KeyList.Count; j++)
                    {
                        if (KeyList[i].ToString() == KeyList[j].ToString())
                        {
                            if (i != j)
                            {
                                KeyList.RemoveAt(j); j--;
                            }
                        }
                    }
                }
                return (string[])KeyList.ToArray(typeof(string));
            }
            /// <summary>
            /// 得到分词关键字,以逗号隔开
            /// </summary>
            /// <param name="key"></param>
            /// <returns></returns>
            public static string GetKeyword(string key, string dictfile)
            {
                string _value = "";
                string[] _key = DoSplit(key, dictfile);
                for (int i = 1; i < _key.Length; i++)
                {
                    if (i == 1)
                        _value = _key[i].Trim();
                    else
                        _value += "," + _key[i].Trim();
                }
                return _value;
            }
            #endregion
        }
    }

  • 相关阅读:
    jquery 回车切换 tab功能
    Jtemplates 基本语法
    Rdlc报表出现空白页解决方法
    动软代码生成与 EntityFramework 实体生成模板
    windows 无法启动asp.net 状态服务 错误 0x8007277a
    导出Excel Gridview
    错误提示:类型“GridView”的控件“GridView1”必须放在具有 runat=server 的窗体标记内 .
    c#与vb.net在App_Code里面编译要通过,需要以下web.config的配置
    個人最近做的最多的重複工作就是excel导出
    Js/Jquery获取iframe中的元素 在Iframe中获取父窗体的元素方法
  • 原文地址:https://www.cnblogs.com/Lxiaojiang/p/3594665.html
Copyright © 2020-2023  润新知