• java过滤emoji表情(成功率高)


    转载自:http://blog.csdn.net/huangchao064/article/details/53283738

    基本能过滤大部分的ios,安卓,微信emoji表情

    有很多别的帖子搜出来很多有bug,有些表情是不能过滤的

    类似于这个判断,过滤成功率有点低

    private static boolean isEmojiCharacter(char codePoint) {//不能完整判断
    	        return (codePoint == 0x0) ||
    	                (codePoint == 0x9) ||
    	                (codePoint == 0xA) ||
    	                (codePoint == 0xD) ||
    	                ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) ||
    	                ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) ||
    	                ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF));
    	    }
    

    *********************下面的方法过滤率就比较高了*********************

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.commons.lang.StringUtils;
    /**
     * 
     * @ClassName: EmojiUtils
     * @Description:过滤emoji表情
     * @author: Simon
     * @date: 2017年9月22日 上午11:11:00
     */
    public class EmojiUtils {
    	public final static String unicodeReg = "[" + "u4E00-u9FBF" + // :CJK
    																	// 统一表意符号
    																	// (CJK
    																	// Unified
    																	// Ideographs)
    			"u4DC0-u4DFF" + // :易经六十四卦符号 (Yijing Hexagrams Symbols)
    			"u0000-u007F" + // :C0控制符及基本拉丁文 (C0 Control and Basic Latin)
    			"u0080-u00FF" + // :C1控制符及拉丁:补充-1 (C1 Control and Latin 1
    								// Supplement)
    			"u0100-u017F" + // :拉丁文扩展-A (Latin Extended-A)
    			"u0180-u024F" + // :拉丁文扩展-B (Latin Extended-B)
    			"u0250-u02AF" + // :国际音标扩展 (IPA Extensions)
    			"u02B0-u02FF" + // :空白修饰字母 (Spacing Modifiers)
    			"u0300-u036F" + // :结合用读音符号 (Combining Diacritics Marks)
    			"u0370-u03FF" + // :希腊文及科普特文 (Greek and Coptic)
    			"u0400-u04FF" + // :西里尔字母 (Cyrillic)
    			"u0500-u052F" + // :西里尔字母补充 (Cyrillic Supplement)
    			"u0530-u058F" + // :亚美尼亚语 (Armenian)
    			"u0590-u05FF" + // :希伯来文 (Hebrew)
    			"u0600-u06FF" + // :阿拉伯文 (Arabic)
    			"u0700-u074F" + // :叙利亚文 (Syriac)
    			"u0750-u077F" + // :阿拉伯文补充 (Arabic Supplement)
    			"u0780-u07BF" + // :马尔代夫语 (Thaana)
    			// "u07C0-u077F"+//:西非书面语言 (N'Ko)
    			"u0800-u085F" + // :阿维斯塔语及巴列维语 (Avestan and Pahlavi)
    			"u0860-u087F" + // :Mandaic
    			"u0880-u08AF" + // :撒马利亚语 (Samaritan)
    			"u0900-u097F" + // :天城文书 (Devanagari)
    			"u0980-u09FF" + // :孟加拉语 (Bengali)
    			"u0A00-u0A7F" + // :锡克教文 (Gurmukhi)
    			"u0A80-u0AFF" + // :古吉拉特文 (Gujarati)
    			"u0B00-u0B7F" + // :奥里亚文 (Oriya)
    			"u0B80-u0BFF" + // :泰米尔文 (Tamil)
    			"u0C00-u0C7F" + // :泰卢固文 (Telugu)
    			"u0C80-u0CFF" + // :卡纳达文 (Kannada)
    			"u0D00-u0D7F" + // :德拉维族语 (Malayalam)
    			"u0D80-u0DFF" + // :僧伽罗语 (Sinhala)
    			"u0E00-u0E7F" + // :泰文 (Thai)
    			"u0E80-u0EFF" + // :老挝文 (Lao)
    			"u0F00-u0FFF" + // :藏文 (Tibetan)
    			"u1000-u109F" + // :缅甸语 (Myanmar)
    			"u10A0-u10FF" + // :格鲁吉亚语 (Georgian)
    			"u1100-u11FF" + // :朝鲜文 (Hangul Jamo)
    			"u1200-u137F" + // :埃塞俄比亚语 (Ethiopic)
    			"u1380-u139F" + // :埃塞俄比亚语补充 (Ethiopic Supplement)
    			"u13A0-u13FF" + // :切罗基语 (Cherokee)
    			"u1400-u167F" + // :统一加拿大土著语音节 (Unified Canadian Aboriginal
    								// Syllabics)
    			"u1680-u169F" + // :欧甘字母 (Ogham)
    			"u16A0-u16FF" + // :如尼文 (Runic)
    			"u1700-u171F" + // :塔加拉语 (Tagalog)
    			"u1720-u173F" + // :Hanunóo
    			"u1740-u175F" + // :Buhid
    			"u1760-u177F" + // :Tagbanwa
    			"u1780-u17FF" + // :高棉语 (Khmer)
    			"u1800-u18AF" + // :蒙古文 (Mongolian)
    			"u18B0-u18FF" + // :Cham
    			"u1900-u194F" + // :Limbu
    			"u1950-u197F" + // :德宏泰语 (Tai Le)
    			"u1980-u19DF" + // :新傣仂语 (New Tai Lue)
    			"u19E0-u19FF" + // :高棉语记号 (Kmer Symbols)
    			"u1A00-u1A1F" + // :Buginese
    			"u1A20-u1A5F" + // :Batak
    			"u1A80-u1AEF" + // :Lanna
    			"u1B00-u1B7F" + // :巴厘语 (Balinese)
    			"u1B80-u1BB0" + // :巽他语 (Sundanese)
    			"u1BC0-u1BFF" + // :Pahawh Hmong
    			"u1C00-u1C4F" + // :雷布查语(Lepcha)
    			"u1C50-u1C7F" + // :Ol Chiki
    			"u1C80-u1CDF" + // :曼尼普尔语 (Meithei/Manipuri)
    			"u1D00-u1D7F" + // :语音学扩展 (Phone tic Extensions)
    			"u1D80-u1DBF" + // :语音学扩展补充 (Phonetic Extensions Supplement)
    			"u1DC0-u1DFF" + // 结合用读音符号补充 (Combining Diacritics Marks
    								// Supplement)
    			"u1E00-u1EFF" + // :拉丁文扩充附加 (Latin Extended Additional)
    			"u1F00-u1FFF" + // :希腊语扩充 (Greek Extended)
    			"u2000-u206F" + // :常用标点 (General Punctuation)
    			"u2070-u209F" + // :上标及下标 (Superscripts and Subscripts)
    			"u20A0-u20CF" + // :货币符号 (Currency Symbols)
    			"u20D0-u20FF" + // :组合用记号 (Combining Diacritics Marks for Symbols)
    			"u2100-u214F" + // :字母式符号 (Letterlike Symbols)
    			"u2150-u218F" + // :数字形式 (Number Form)
    			"u2190-u21FF" + // :箭头 (Arrows)
    			"u2200-u22FF" + // :数学运算符 (Mathematical Operator)
    			"u2300-u23FF" + // :杂项工业符号 (Miscellaneous Technical)
    			"u2400-u243F" + // :控制图片 (Control Pictures)
    			"u2440-u245F" + // :光学识别符 (Optical Character Recognition)
    			"u2460-u24FF" + // :封闭式字母数字 (Enclosed Alphanumerics)
    			"u2500-u257F" + // :制表符 (Box Drawing)
    			"u2580-u259F" + // :方块元素 (Block Element)
    			"u25A0-u25FF" + // :几何图形 (Geometric Shapes)
    			"u2600-u26FF" + // :杂项符号 (Miscellaneous Symbols)
    			"u2700-u27BF" + // :印刷符号 (Dingbats)
    			"u27C0-u27EF" + // :杂项数学符号-A (Miscellaneous Mathematical
    								// Symbols-A)
    			"u27F0-u27FF" + // :追加箭头-A (Supplemental Arrows-A)
    			"u2800-u28FF" + // :盲文点字模型 (Braille Patterns)
    			"u2900-u297F" + // :追加箭头-B (Supplemental Arrows-B)
    			"u2980-u29FF" + // :杂项数学符号-B (Miscellaneous Mathematical
    								// Symbols-B)
    			"u2A00-u2AFF" + // :追加数学运算符 (Supplemental Mathematical Operator)
    			"u2B00-u2BFF" + // :杂项符号和箭头 (Miscellaneous Symbols and Arrows)
    			"u2C00-u2C5F" + // :格拉哥里字母 (Glagolitic)
    			"u2C60-u2C7F" + // :拉丁文扩展-C (Latin Extended-C)
    			"u2C80-u2CFF" + // :古埃及语 (Coptic)
    			"u2D00-u2D2F" + // :格鲁吉亚语补充 (Georgian Supplement)
    			"u2D30-u2D7F" + // :提非纳文 (Tifinagh)
    			"u2D80-u2DDF" + // :埃塞俄比亚语扩展 (Ethiopic Extended)
    			"u2E00-u2E7F" + // :追加标点 (Supplemental Punctuation)
    			"u2E80-u2EFF" + // :CJK 部首补充 (CJK Radicals Supplement)
    			"u2F00-u2FDF" + // :康熙字典部首 (Kangxi Radicals)
    			"u2FF0-u2FFF" + // :表意文字描述符 (Ideographic Description Characters)
    			"u3000-u303F" + // :CJK 符号和标点 (CJK Symbols and Punctuation)
    			"u3040-u309F" + // :日文平假名 (Hiragana)
    			"u30A0-u30FF" + // :日文片假名 (Katakana)
    			"u3100-u312F" + // :注音字母 (Bopomofo)
    			"u3130-u318F" + // :朝鲜文兼容字母 (Hangul Compatibility Jamo)
    			"u3190-u319F" + // :象形字注释标志 (Kanbun)
    			"u31A0-u31BF" + // :注音字母扩展 (Bopomofo Extended)
    			"u31C0-u31EF" + // :CJK 笔画 (CJK Strokes)
    			"u31F0-u31FF" + // :日文片假名语音扩展 (Katakana Phonetic Extensions)
    			"u3200-u32FF" + // :封闭式 CJK 文字和月份 (Enclosed CJK Letters and
    								// Months)
    			"u3300-u33FF" + // :CJK 兼容 (CJK Compatibility)
    			"u3400-u4DBF" + // :CJK 统一表意符号扩展 A (CJK Unified Ideographs
    								// Extension A)
    			"u4DC0-u4DFF" + // :易经六十四卦符号 (Yijing Hexagrams Symbols)
    			"u4E00-u9FBF" + // :CJK 统一表意符号 (CJK Unified Ideographs)
    			"uA000-uA48F" + // :彝文音节 (Yi Syllables)
    			"uA490-uA4CF" + // :彝文字根 (Yi Radicals)
    			"uA500-uA61F" + // :Vai
    			"uA660-uA6FF" + // :统一加拿大土著语音节补充 (Unified Canadian Aboriginal
    								// Syllabics Supplement)
    			"uA700-uA71F" + // :声调修饰字母 (Modifier Tone Letters)
    			"uA720-uA7FF" + // :拉丁文扩展-D (Latin Extended-D)
    			"uA800-uA82F" + // :Syloti Nagri
    			"uA840-uA87F" + // :八思巴字 (Phags-pa)
    			"uA880-uA8DF" + // :Saurashtra
    			"uA900-uA97F" + // :爪哇语 (Javanese)
    			"uA980-uA9DF" + // :Chakma
    			"uAA00-uAA3F" + // :Varang Kshiti
    			"uAA40-uAA6F" + // :Sorang Sompeng
    			"uAA80-uAADF" + // :Newari
    			"uAB00-uAB5F" + // :越南傣语 (Vi?t Thái)
    			"uAB80-uABA0" + // :Kayah Li
    			"uAC00-uD7AF" + // :朝鲜文音节 (Hangul Syllables)
    			// "uD800-uDBFF"+//:High-half zone of UTF-16
    			// "uDC00-uDFFF"+//:Low-half zone of UTF-16
    			"uE000-uF8FF" + // :自行使用区域 (Private Use Zone)
    			"uF900-uFAFF" + // :CJK 兼容象形文字 (CJK Compatibility Ideographs)
    			"uFB00-uFB4F" + // :字母表达形式 (Alphabetic Presentation Form)
    			"uFB50-uFDFF" + // :阿拉伯表达形式A (Arabic Presentation Form-A)
    			"uFE00-uFE0F" + // :变量选择符 (Variation Selector)
    			"uFE10-uFE1F" + // :竖排形式 (Vertical Forms)
    			"uFE20-uFE2F" + // :组合用半符号 (Combining Half Marks)
    			"uFE30-uFE4F" + // :CJK 兼容形式 (CJK Compatibility Forms)
    			"uFE50-uFE6F" + // :小型变体形式 (Small Form Variants)
    			"uFE70-uFEFF" + // :阿拉伯表达形式B (Arabic Presentation Form-B)
    			"uFF00-uFFEF" + // :半型及全型形式 (Halfwidth and Fullwidth Form)
    			"uFFF0-uFFFF]";// :特殊 (Specials);
    
    	/**
    	 * 将字符串转成unicode
    	 * 
    	 * @param str
    	 *            待转字符串
    	 * @return unicode字符串
    	 */
    	public static String convert(String str) {
    		str = (str == null ? "" : str);
    		String tmp;
    		StringBuffer sb = new StringBuffer(1000);
    		char c;
    		int i, j;
    		sb.setLength(0);
    		for (i = 0; i < str.length(); i++) {
    			c = str.charAt(i);
    			sb.append("\u");
    			j = (c >>> 8); // 取出高8位
    			tmp = Integer.toHexString(j);
    			if (tmp.length() == 1) {
    				sb.append("0");
    			}
    			sb.append(tmp);
    			j = (c & 0xFF); // 取出低8位
    			tmp = Integer.toHexString(j);
    			if (tmp.length() == 1) {
    				sb.append("0");
    			}
    			sb.append(tmp);
    
    		}
    		return (new String(sb).toUpperCase());
    	}
    
    	/**
    	 * 2)unicode转成字符串,与上述过程反向操作即可 将unicode 字符串
    	 * 
    	 * @param str
    	 *            待转字符串
    	 * @return 普通字符串
    	 */
    	public static String revert(String str) {
    		str = (str == null ? "" : str);
    		if (str.indexOf("\u") == -1)// 如果不是unicode码则原样返回
    			return str;
    
    		StringBuffer sb = new StringBuffer(1000);
    
    		for (int i = 0; i < str.length() - 6;) {
    			String strTemp = str.substring(i, i + 6);
    			String value = strTemp.substring(2);
    			int c = 0;
    			for (int j = 0; j < value.length(); j++) {
    				char tempChar = value.charAt(j);
    				int t = 0;
    				switch (tempChar) {
    				case 'a':
    					t = 10;
    					break;
    				case 'b':
    					t = 11;
    					break;
    				case 'c':
    					t = 12;
    					break;
    				case 'd':
    					t = 13;
    					break;
    				case 'e':
    					t = 14;
    					break;
    				case 'f':
    					t = 15;
    					break;
    				default:
    					t = tempChar - 48;
    					break;
    				}
    
    				c += t * ((int) Math.pow(16, (value.length() - j - 1)));
    			}
    			sb.append((char) c);
    			i = i + 6;
    		}
    		return sb.toString();
    	}
        /**
         * 
         * @author: Simon
         * @Description:传入需要过滤的字符
         * @date: 2017年9月22日 上午11:11:33
         * @param string
         * @return
         */
    	public static String emojiChange(String string) {
    		// System.out.println("__________________________________");
    		try {
    			// System.out.println("all-string:"+string);
    			// System.out.println("all-unicode:"+convert(string));
    			Pattern pattern = Pattern.compile(unicodeReg);
    			StringBuffer sbBuffer = new StringBuffer();
    			for (int i = 0; i < string.length(); i++) {
    				char c = string.charAt(i);
    				String temp = String.valueOf(c);
    				Matcher matcher = pattern.matcher(temp);
    				if (matcher.find()) {
    					sbBuffer.append(temp);
    				} else {
    					sbBuffer.append("□");
    				}
    				// System.out.println("temp:"+temp+";unicode:"+convert(temp));
    			}
    			// System.out.println("sb:"+sbBuffer.toString());
    			// System.out.println("--------------------------------------");
    			return sbBuffer.toString();
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		return "";
    	}
    }
    

      

  • 相关阅读:
    this.$router.push({})实现路由跳转
    JavaScript 实用小技巧
    js中如何实现数组去重--面试题
    面试中知识点积累
    从输入url到网页呈现的过程
    ptyhon学习---作业day1
    python学习---for循环
    python学习---while循环
    python学习---if else流程判断
    ajax最基础入门
  • 原文地址:https://www.cnblogs.com/SimonHu1993/p/7573868.html
Copyright © 2020-2023  润新知