一个简单的实例,从网页中爬取整部小说!
# A simple example: scrape an entire web novel chapter by chapter.
import codecs
import re
import urllib
import urllib.request

urls = []
url = "https://www.biquger.com/biquge/39691/"
# Anti-scraping measure: present a normal browser User-Agent.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
# Request the table-of-contents page.
request = urllib.request.Request(url, headers=header)
response = urllib.request.urlopen(request)
# Decoding as utf-8 fails for this site; it serves gbk.
data = response.read().decode('gbk')

# Match the target of every href attribute.
# NOTE: the original pattern ("href=['"]...") had unescaped double
# quotes inside a double-quoted string — a SyntaxError. Fixed by
# using a raw string with the inner quotes escaped.
pattern = re.compile(r"href=['\"]([^\"'>]*?)['\"].*?", re.S)
items = re.findall(pattern, data)
for item in items:
    # Keep only absolute https links that belong to this novel (id 39691),
    # which also skips links already seen on other parts of the page.
    if 'https' in item and '39691' in item:
        urls.append(item)
# print(urls)
print(urls[2])

# Fetch chapters 2..9 and append their text to foo.txt.
# Open the output file ONCE with a context manager instead of
# re-opening and re-closing it for every scraped fragment.
with codecs.open("foo.txt", "a", 'utf-8') as fo:
    for i in range(2, 10):
        request = urllib.request.Request(url=urls[i], headers=header)
        response = urllib.request.urlopen(request)
        # Same encoding caveat as above: the site serves gbk.
        data = response.read().decode('gbk')
        # The chapter body sits between the <!--g0--> marker and <center>.
        pattern = re.compile('<!--g0-->.*?<center>', re.S)
        items = re.findall(pattern, data)
        for item in items:
            # Strip markup: remove <br /> tags and spaces.
            temp = item.replace('<br />', '')
            temp = temp.replace(' ', '')
            fo.write(temp)
            print(temp)