Finished the data crawling: 33,335 records in total, all saved to the database.
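To confirm that every row actually landed in MySQL, a quick count per message type can be run against the same detail_letter table the crawler script below writes to. This is a minimal sketch; the connection settings and table name mirror the ones used in the script.

import pymysql

# Sanity check: count the stored rows per message type.
db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="123456", db="letter", charset="utf8")
cursor = db.cursor()
cursor.execute("select type, count(*) from detail_letter group by type")
for row in cursor.fetchall():
    print(row)  # one row per category, e.g. ('建议', ...)
cursor.execute("select count(*) from detail_letter")
print("total:", cursor.fetchone()[0])  # should print 33335 if everything was saved
db.close()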
import re
import time

import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree


def open_file(file):
    """Read one originalId per line from a text file."""
    original_id = []
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            original_id.append(line.strip())
    return original_id


def open_url(url, file, msg_type):
    """Fetch the detail page for every originalId and save the parsed fields."""
    original_id = open_file(file)
    for oid in original_id:
        detail_url = url + oid
        header = {
            "User-Agent": UserAgent().random
        }
        req = requests.get(detail_url, headers=header, timeout=10)
        html = etree.HTML(req.text)
        try:
            question_title = html.xpath(
                '//div[contains(@class,"col-xs-10 col-sm-10")]//strong/text()')[0].strip()
            question_date = html.xpath(
                '//div[contains(@class,"col-xs-5 col-lg-3")]/text()')[0].strip()
            question_content = html.xpath(
                '//div[contains(@class,"col-xs-12 col-md-12 column p-2")]//text()')
            # if msg_type == "投诉":
            #     reply_organ = html.xpath(
            #         '//div[contains(@class,"col-xs-9 col-sm-7")]//span/text()')[0].strip()
            reply_organ = html.xpath(
                '//div[contains(@class,"col-xs-9 col-sm-7")]/text()')[1].strip()
            reply_date = html.xpath(
                '//div[contains(@class,"col-xs-12 col-sm-3")]/text()')[0].strip()
            reply_content = html.xpath(
                '//div[contains(@class,"col-xs-12 col-md-12 column p-4")]//text()')
            # Extract a YYYY-MM-DD date. The original pattern r"(d{4}-dd-dd)"
            # was missing its backslashes and never matched.
            date_pattern = re.compile(r"(\d{4}-\d{2}-\d{2})")
            print(question_title)
            q_date = date_pattern.findall(question_date)[0]
            print(q_date)
            q_con = "".join(question_content).strip()
            print(q_con)
            print(reply_organ)
            r_date = date_pattern.findall(reply_date)[0]
            print(r_date)
            r_con = "".join(reply_content).strip()
            print(r_con)
            print(msg_type)
            # A fresh connection is opened per insert and closed inside add().
            r = add(open_conn("letter"),
                    question_title,
                    q_date,
                    q_con,
                    reply_organ,
                    r_date,
                    r_con,
                    oid,
                    msg_type)
            print(r)
        except IndexError:
            # Pages that fail to parse (deleted or differently structured) are skipped.
            pass
        time.sleep(0.5)  # throttle requests
        print("=" * 20)


def open_conn(dbname):
    """Open a MySQL connection to the given database."""
    db = pymysql.connect(
        host="localhost",
        port=3306,
        user="root",
        passwd="123456",
        db=dbname,
        charset="utf8")
    return db


def add(
        db,
        question_title,
        question_date,
        question_content,
        reply_organ,
        reply_date,
        reply_content,
        original_id,
        msg_type):
    """Insert one record into detail_letter, then close the connection."""
    cursor = db.cursor()
    sql = ("insert into detail_letter(question_title,question_date,question_content,"
           "reply_organ,reply_date,reply_content,original_id,type) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s)")
    cursor.execute(
        sql,
        [question_title,
         question_date,
         question_content,
         reply_organ,
         reply_date,
         reply_content,
         original_id,
         msg_type])
    db.commit()
    db.close()
    return "Row inserted."


if __name__ == '__main__':
    # open_url(
    #     "http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId=",
    #     "tousu.txt",
    #     "投诉")
    # open_url(
    #     "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=",
    #     "zixun.txt",
    #     "咨询")
    open_url(
        "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=",
        "jianyi.txt",
        "建议")
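For reference, the INSERT in add() assumes a detail_letter table already exists in the letter database. A minimal schema matching the script might look like the sketch below; the column names come from the INSERT statement, but the types and lengths are assumptions.

import pymysql

# Assumed schema for detail_letter. Column names are taken from the INSERT in
# add(); the types and lengths are guesses sized for letter-style text fields.
ddl = """
create table if not exists detail_letter (
    id int primary key auto_increment,
    question_title varchar(255),
    question_date date,
    question_content text,
    reply_organ varchar(255),
    reply_date date,
    reply_content text,
    original_id varchar(64),
    type varchar(16)
)
"""

db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="123456", db="letter", charset="utf8")
db.cursor().execute(ddl)
db.commit()
db.close()

With this table in place, running the script once per category (投诉, 咨询, 建议) fills the type column accordingly, so all three kinds of records share one table.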