package org.jimmy.autosearch20180821.test; import java.util.regex.Matcher; import java.util.regex.Pattern; public class TestUrlRegularExpression { public static void main(String[] args) { //默认是贪婪匹配,暂时还没尝试写一个正则匹配 String urlRegex = "https?://(\w|-)+(\.(\w|-)+)+(/(\w|-)+(/((\w|-)+\.(\w|-)+)|/(\w|-)*)|(/((\w|-)+\.(\w|-)+)|/(\w|-)+)|/?)/(((\w|-)+\.(\w|-)+)|(\w|-)+(\?\w+=(\w|-|%|[u4e00-u9fa5])+(\&\w+=(\w|-|%|[u4e00-u9fa5])+)*)?)";
修改为:
urlRegex = "https?://(\w|-)+(\.(\w|-)+)+(/(\w|-)+(/((\w|-)+\.(\w|-)+)|/(\w|-)*)|(/((\w|-)+\.(\w|-)+)|/(\w|-)+)|/?)/(((\w|-)+\.(\w|-)+)|(\w|-)*(\?\w+=(\w|-|%|[u4e00-u9fa5])+(\&\w+=(\w|-|%|[u4e00-u9fa5])+)*)?)";
urlRegex = "https?://(\w|-)+(\.(\w|-)+)+(/(\w+(\?(\w+=(\w|%|-)*(\&\w+=(\w|%|-)*)*)?)?)?)+";//修改版 String urlRegex2 = "https?://(\w|-)+(\.(\w|-)+)+(/(\w|-)+(/((\w|-)+\.(\w|-)+)|/(\w|-)*)|(/((\w|-)+\.(\w|-)+)|/(\w|-)+)|/?)"; //我在百度搜索了java 正则表达式,然后复制了放在url里面的 String url = "https://www.baidu.com/s?wd=java%20%E6%AD%A3%E5%88%99%E8%A1%A8%E8%BE%BE%E5%BC%8F&rsv_spt=1&rsv_iqid=0xf233885e000326c0&issp=1&f=8&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=24&rsv_sug1=8&rsv_sug7=100&rsv_t=0d31XJ5IR0T98Bv150wUMKQHirYYsh2IgKsJFk0FH4wGur10ND3LypRnWtdrcFCsDH%2F3&rsv_sug2=0&inputT=6942&rsv_sug4=6942"; //这个就是不带参数的url了 String url2 = "https://www.baidu.com"; Pattern pattern = Pattern.compile(urlRegex); Matcher matcher = pattern.matcher(url); String isSuccess = "不匹配"; if(matcher.matches()){ isSuccess = "匹配"; } System.out.println(isSuccess); pattern = Pattern.compile(urlRegex2); matcher = pattern.matcher(url2); isSuccess = "不匹配"; if(matcher.matches()){ isSuccess = "匹配"; } System.out.println(isSuccess); } }
还是直接上代码.
运行结果见下图:
修复了之前不能匹配微软必应搜索 URL 地址的问题。