功能
- 一级分页列表页, 二级数据页
- 不定表头, 写入CSV
- 正则匹配: 在 () 中使用 ?: 实现只匹配、不捕获
- HTTP头设置
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import re
import time
import requests
import csv
# Shared HTTP session so cookies and keep-alive persist across all requests.
session = requests.session()
# Browser-like request headers; the Referer mimics arriving from the
# listing index page so the server treats us like a normal visitor.
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
'Referer':'http://jtgl.beijing.gov.cn/jgj/93950/check_car/ycl/index.html',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.5,en;q=0.3,de-DE;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Cache-Control': 'max-age=0'
}
# Site root the relative page paths are joined onto.
# NOTE(review): placeholder value — per the sample request below this is
# presumably http://jtgl.beijing.gov.cn; confirm before running.
base_link = 'http://somewhere'
# Reference only: a raw browser request captured for the target site,
# kept here to document the headers/cookies the real browser sends.
'''
GET /jgj/93950/check_car/ycl/22719550-2.html HTTP/1.1
Host: jtgl.beijing.gov.cn
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Language: en-US,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.5,en;q=0.3,de-DE;q=0.2
Accept-Encoding: gzip, deflate
Referer: http://jtgl.beijing.gov.cn/jgj/93950/check_car/ycl/index.html
Connection: keep-alive
Cookie: _trs_uv=kde6gvt0_365_j8m; _va_id=ec5b1ecd783dfc52.1596438844.30.1615800849.1615799023.; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22173b32ba72347-0efda1230057718-4c302273-2073600-173b32ba72436a%22%7D; _va_ref=%5B%22%22%2C%22%22%2C1615800833%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D54coDM4TQyJUBe5UqX7uRRIy7UXZ8zHoufRE-ufZk-8fMFrJRtMfs_TQ-sPD2UYtZnPmS86B0DUph3QiQ_8j0sKSmN4ZJVuCRurpig3dJ3W%26wd%3D%26eqid%3Db7d847b70004074600000003604f0468%22%5D; __jsluid_h=9c085f396ee2cc91e800331f4d8fd4a8; _va_ses=*
Upgrade-Insecure-Requests: 1
If-Modified-Since: Mon, 15 Mar 2021 06:28:19 GMT
If-None-Match: W/"604efe83-a36a"
Cache-Control: max-age=0
'''
def request_get(url, encoding='UTF-8', tout=20, retries=10):
    """Fetch *url* with the shared session, retrying transient failures.

    Parameters
    ----------
    url : str
        Absolute URL to download.
    encoding : str
        Encoding forced onto the response before reading ``.text``.
    tout : int or float
        Per-request timeout in seconds.
    retries : int
        Maximum number of attempts before giving up.

    Returns
    -------
    str or None
        The decoded response body, or ``None`` when every attempt failed.
    """
    for _attempt in range(retries):
        # Small delay before every attempt to avoid hammering the server.
        time.sleep(0.2)
        try:
            response = session.get(url, timeout=tout, headers=header)
            response.encoding = encoding
            return response.text
        except requests.ReadTimeout:
            print('ReadTimeout')
        except requests.ConnectionError:
            # The original caught the *builtin* ConnectionError, which does
            # not cover requests' ConnectionError (it derives from
            # RequestException); use the requests exception so connection
            # failures are reported under their own label.
            print('ConnectionError')
        except requests.RequestException:
            print('RequestException')
    print('Exceed retry limit')
    return None
def lv1_to_lv2(page):
    """Return the detail-page links listed on index page *page*.

    Parameters
    ----------
    page : int
        1-based page number of the paginated listing.

    Returns
    -------
    list[str]
        Relative URLs of the second-level data pages; empty when the
        download failed or the page listed no entries.
    """
    link_lv1 = base_link + '/jgj/93950/check_car/ycl/22719550-' + str(page) + '.html'
    content = request_get(link_lv1, 'UTF-8', 20, 10)
    if content is None:
        # request_get gave up after all retries; report "no links" instead
        # of crashing findall() on a None body.
        return []
    # Capture the href directly; findall with one group returns the group
    # strings, so no second match per hit is needed.
    pattern = re.compile(r'<p class="content_li_title"><a href="([^"]+)" onclick')
    return pattern.findall(content)
def lv2_to_data(link_lv2):
    """Scrape one detail page and return its date plus the table rows.

    Parameters
    ----------
    link_lv2 : str
        Relative URL of the detail page; joined onto ``base_link``.

    Returns
    -------
    dict
        ``{'year', 'month', 'date'}`` parsed from the page title (0 when the
        title could not be parsed or the download failed) and ``'rows'``, a
        list of ``{'name': station, 'value': count}`` dicts.
    """
    link_lv2 = base_link + link_lv2
    content = request_get(link_lv2, 'UTF-8', 20, 10)
    data = {'year': 0, 'month': 0, 'date': 0, 'rows': []}
    if content is None:
        # Download failed after all retries; return the zeroed skeleton.
        return data
    # Title like "...2021年3月15日全市检测场实际检(验)..."; (?:检|验) matches
    # either verb without adding a capture group.
    # NOTE: the original used "d+" (a literal letter d), which can never
    # match the digits on the page; "\d+" is the intended form.
    title = re.search(
        r'<p class="titles">(\d+).?年(\d+)月(\d+)日全市检测场实际(?:检|验)', content)
    if title:
        data['year'] = title.group(1)
        data['month'] = title.group(2)
        data['date'] = title.group(3)
    # Strip whitespace/presentational markup so rows collapse to the
    # compact <tr><td>..</td></tr> shape matched below.
    content = remove_style(content)
    row_pattern = re.compile(
        r'<tr><td>\d+</td>(?:<td>[^<]+</td>)?'
        r'<td>([^<]+)</td><td>(\d+)</td>(?:<td></td>)?</tr>')
    rows = []
    for name, value in row_pattern.findall(content):
        rows.append({'name': name.replace(' ', ''), 'value': value})
    data['rows'] = rows
    return data
def remove_style(text):
    r"""Strip whitespace and presentational markup from an HTML fragment.

    Removes whitespace runs, <span>/<p>/<strong>/<font> tags and common
    presentational attributes so table rows collapse into the compact
    ``<tr><td>..</td></tr>`` shape the row regex in lv2_to_data expects.

    NOTE: the original pattern used ``s+`` and ``d+`` (literal letters),
    which deleted runs of the letter "s" and never matched digit-valued
    attributes; ``\s+`` / ``\d+`` is the intended form.
    """
    css_pattern = re.compile(
        r'(\s+|<span[^>]*>|</span>|<p[^>]*>|</p>|</?strong>|<font[^>]+>|</font>'
        r'|rowspan="\d+"|class="[^"]+"|style="[^"]+"|height="\d+"|width="\d+"'
        r'|x:num="\d+"|x:str="")')
    return css_pattern.sub('', text)
# Main process: walk all 80 index pages, scrape every detail page, and
# collect the rows plus the union of all station names seen (dict keys
# double as an insertion-ordered set for the CSV header).
csv_rows = []
csv_headers = {}
for page in range(1, 81):
    print(page)
    for link_lv2 in lv1_to_lv2(page):
        print(link_lv2)
        data = lv2_to_data(link_lv2)
        csv_rows.append(data)
        for row in data['rows']:
            csv_headers.setdefault(row['name'], 1)
        # time.sleep(1)
# Write one CSV row per scraped page: a date column followed by one column
# per station name, in first-seen order. Stations missing from a given
# page are left blank by DictWriter.
fieldnames = ['date'] + list(csv_headers.keys())
with open('output.csv', 'w', encoding='utf8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for csv_row in csv_rows:
        record = {'date': '{}.{}.{}'.format(csv_row['year'], csv_row['month'], csv_row['date'])}
        for cell in csv_row['rows']:
            record[cell['name']] = cell['value']
        writer.writerow(record)