在用 .NET Core 写爬虫的时候，发现默认不再支持 GB2312 编码了。
解决方案如下：
1. 引入 NuGet 包 System.Text.Encoding.CodePages；
2. 在需要的地方调用 Encoding.RegisterProvider(CodePagesEncodingProvider.Instance) 注册 EncodingProvider；
3. 调用 Encoding.GetEncoding("GB2312").GetString(pageSource) 对下载到的字节进行解码。
/// <summary>
/// Downloads the page at <paramref name="allCityUrl"/>, decodes the bytes as GB2312,
/// extracts every (city name, host) pair with a regular expression, and writes one
/// <c>name:https://host</c> line per match to the file "房天下城市列表.txt".
/// </summary>
/// <param name="allCityUrl">URL of the page that lists all cities.</param>
public void CityCrawler(string allCityUrl)
{
    // GB2312 is not built into .NET Core; registering the code-pages provider makes
    // Encoding.GetEncoding("GB2312") resolvable.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    byte[] pageSource;
    // Disposing the client also disposes the handler it was constructed with.
    using (var client = new HttpClient(new HttpClientHandler { AutomaticDecompression = DecompressionMethods.GZip }))
    {
        // The method is synchronous, so the download is blocked on here; failures
        // therefore surface wrapped in an AggregateException.
        pageSource = client.GetByteArrayAsync(allCityUrl).Result;
    }

    var result = Encoding.GetEncoding("GB2312").GetString(pageSource);

    // Group 1: a run of 2-5 CJK characters (\u4e00-\u9fa5) immediately before a closing quote.
    // Group 2: the "<letters>.esf.fang.com" host found in the following "url" field.
    // Verbatim string: "" is a literal quote and \uXXXX is interpreted by the regex engine.
    // Whitespace around ',' and ':' is optional so both compact and spaced JSON match.
    // NOTE(review): pattern reconstructed from a listing whose backslash escapes were lost —
    // confirm against the live payload.
    var cities = Regex.Matches(
        result,
        @"([\u4e00-\u9fa5]{2,5})""\s*,\s*""spell""\s*:\s*""[A-Za-z]+""\s*,\s*""url""\s*:\s*""//([A-Za-z]{2,}\.esf\.fang\.com)");

    var cityList = new List<string>(cities.Count);
    foreach (Match city in cities)
    {
        cityList.Add($"{city.Groups[1].Value}:https://{city.Groups[2].Value}");
    }

    File.WriteAllLines("房天下城市列表.txt", cityList);
}