SEO要是和python数据分析联合在一起,可谓是很好的方法,没事的时候尝试写的分析网站被百度收录的网址和标题。
首先得引入两个py模块,分别是:Beautiful Soup 和 requests
没有下载这两个模块的可以用以下命令下载:
pip install beautifulsoup4
pip install requests
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""百度收录网址标题查询

Scrape Baidu's ``site:`` search results and print the title and link of
every indexed page for the target domain.

Fixes over the original:
  * ``url_b`` was referenced but never defined (NameError on first loop).
  * The page offset was appended without the ``&pn=`` query parameter,
    so pagination could not have worked.
  * Ported from Python 2 (``reload(sys)`` / ``setdefaultencoding`` /
    print statements) to Python 3.
"""
import datetime
import re
from random import randint

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    # Spoofed client IP so repeated requests look less like a single scraper.
    "X-Forwarded-For": '%s:%s:%s:%s' % (randint(1, 255), randint(1, 255), randint(1, 255), randint(1, 255)),
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive",
}

# Base search URL: site:zhimo.yuanzhumuban.cc with Baidu's tracking params.
BASE_URL = ('https://www.baidu.com/s?wd=site%3Azhimo.yuanzhumuban.cc'
            '&rsv_spt=1&rsv_iqid=0xac952cfa0005be29&issp=1&f=8&rsv_bp=0'
            '&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=1')

# Compiled once and reused for every result entry (they ran per-item before).
_LINK_RE = re.compile(r'" href="(.*?)" target="_blank">.*?</a></h3>', re.S)
_TITLE_RE = re.compile(r'" href=".*?" target="_blank">(.*?)</a>', re.S)


def main():
    """Walk Baidu result pages and print title/link for each <h3 class="t">."""
    start_time = datetime.datetime.now()  # 取当前时间
    print('[-] 现在时间:%s' % start_time)
    # Baidu paginates with &pn=<offset>, 10 results per page, 750 max here.
    for pn in range(0, 750, 10):
        print('第【%s】页' % pn)
        # BUG FIX: original did `url_a + str(pn) + url_b` with `url_b`
        # undefined; the offset must be passed as the `pn` parameter.
        join_url = BASE_URL + '&pn=' + str(pn)
        # 从Url 中取回网站源码
        html_doc = requests.get(join_url, headers=HEADERS).content
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        # 取所有H3标签中class为t的所有元素
        all_h3 = soup.find_all('h3', attrs={'class': 't'})
        print('[+] 此页共找到%s条数据!' % len(all_h3))
        for each in all_h3:
            markup = str(each)
            link = _LINK_RE.findall(markup)
            title = _TITLE_RE.findall(markup)
            # Guard against entries whose markup doesn't match the pattern
            # (the original raised IndexError on them).
            if link and title:
                print('[-] 标题:%s 链接:%s' % (str(title[0]), str(link[0])))


if __name__ == '__main__':
    main()