1. Crawl all post URLs and titles from a cnblogs (博客园) blog, along with a post count
import re
import requests
from lxml import etree
import json


# Decorator: take the (count, url_list) returned by the wrapped function and
# build a dict that maps every post title to its URL, plus a 'count' entry.
def func_1_deco(func_1):
    def wrapper(*args, **kwargs):
        dic = dict()
        count, url_lis = func_1(*args, **kwargs)
        dic['count'] = count
        # XPath of the post title link on a cnblogs post page
        name_xpath = '//*[@id="cb_post_title_url"]/text()'
        for url in url_lis:
            response = requests.get(url).text
            response_html = etree.HTML(response)
            name = response_html.xpath(name_xpath)[0]
            print(name)
            dic[name] = url
        return dic
    return wrapper


@func_1_deco
def func(url):
    lis = []
    count = 1  # page number
    while True:
        count_1 = len(lis)
        response = requests.get(f'{url}default.html?page={count}').text
        data_1 = re.findall(' href="(.*?)"', response, re.S)
        for a in data_1:  # type: str
            # keep absolute links to .html post pages, skip 'archive' links
            if a.startswith('http') and a.endswith('html') and 'archive' not in a:
                lis.append(a)
        count += 1
        lis = list(set(lis))  # de-duplicate across pages
        count_2 = len(lis)
        # no new links appeared on this page, so every post has been collected
        if count_1 == count_2:
            return count_2, lis  # number of posts, list of post URLs


# Pass your blog's homepage URL; it must end with a '/'.
# The resulting dict has one 'count' entry; every other entry is title -> URL.
dic = func('your blog homepage URL')
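
The json import in the listing above is never actually used. If you want to keep the result around, here is a minimal sketch of writing the title-to-URL mapping to disk; it relies on the json import and the dic variable from the listing, and the file name blog_posts.json is just an example. ensure_ascii=False keeps non-ASCII titles readable in the file.

# Persist the crawled mapping so the crawl does not have to be repeated.
# 'blog_posts.json' is only an example file name.
with open('blog_posts.json', 'w', encoding='utf-8') as f:
    json.dump(dic, f, ensure_ascii=False, indent=2)
print(f"saved {dic['count']} posts to blog_posts.json")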