import requests
import re
import json
from requests.exceptions import RequestException
def get(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``str`` when the server answers with HTTP 200;
        ``None`` for any other status code or when the request raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    # Keep the try body limited to the call that actually performs network I/O.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse(html):
    """Yield one dict per book entry matched in ``html``.

    An entry is an ``<li>...</li>`` span that contains, in order: the text
    ``cover``, an ``href="..."`` attribute followed by whitespace and a
    ``title="..."`` attribute, the text ``more-meta``, and ``author``,
    ``year`` and ``publisher`` spans.

    Args:
        html: Page markup as ``str``. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        dict with the keys ``'url'``, ``'title'``, ``'name'`` (author),
        ``'date'`` (year) and ``'pulisher'``. The last key is misspelled but
        kept as-is so existing consumers of the output keep working.
    """
    if not html:
        return
    # Raw strings keep ``\s`` as a regex escape; re.S lets ``.`` cross the
    # line breaks inside a single <li> element.
    pattern = re.compile(
        r'<li.*?cover.*?href="(.*?)"\s+title="(.*?)">.*?more-meta'
        r'.*?author">(.*?)</span>.*?year">(.*?)</span>'
        r'.*?publisher">(.*?)</span>.*?</li>',
        re.S,
    )
    for url, title, author, year, publisher in pattern.findall(html):
        yield {
            'url': url,
            'title': title,
            'name': author.strip(),
            'date': year.strip(),
            'pulisher': publisher.strip(),
        }
def write_to_file(content):
    """Append ``content`` to ``result.txt`` as one JSON document per line.

    Args:
        content: JSON-serialisable object (a dict of book fields in this
            script).

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written verbatim rather than as ``\\uXXXX`` escapes.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Download https://book.douban.com/, then print and save each entry.

    Every dict produced by ``parse`` is printed and appended to the results
    file through ``write_to_file``. If the download fails (``get`` returns
    ``None``) a short message is printed and nothing is written.
    """
    url = 'https://book.douban.com/'
    html = get(url)
    if html is None:
        # get() signals a failed request or a non-200 status with None;
        # passing that on to the regex parser would raise a TypeError.
        print('Failed to fetch ' + url)
        return
    for item in parse(html):
        print(item)
        write_to_file(item)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()