# coding=utf-8
# @author: Mana_菜小刀
import requests
import queue
import threading
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}

# FIX: serializes the read-copy-write cycle on result.xls. xlrd/xlutils
# cannot merge concurrent edits, so unsynchronized worker threads silently
# lost rows (each thread re-read the file, appended one row, and the last
# save won).
xls_lock = threading.Lock()

# Create the result workbook with a header row before any worker runs.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet('收录search')
lst_name = ['url', '收录/未收录', '图片']
for col, title in enumerate(lst_name):
    sheet1.write(0, col, title)
myxls.save('result.xls')


def log(*args, **kwargs):
    """Thin wrapper over print so all output goes through one place."""
    print(*args, **kwargs)


class baiduSpider(threading.Thread):
    """Worker thread: pulls Baidu search URLs off a shared queue and records
    for each query whether it is indexed, and whether images are shown,
    as one row in result.xls.
    """

    def __init__(self, queue_li, name):
        threading.Thread.__init__(self)
        self._queue = queue_li  # shared queue.Queue of search URLs
        self._name = name       # label for this worker (informational only)

    def run(self):
        # Drain the shared queue. empty()+get() is racy between threads,
        # but the worst case is a queue.Empty, absorbed by the except below.
        while not self._queue.empty():
            url = self._queue.get()
            try:
                self.get_url(url)
            except Exception as e:
                # FIX: include the failing URL so errors are traceable
                # (the original logged only the bare exception).
                log('error for', url, ':', e)

    def get_url(self, url):
        """Fetch one Baidu result page, classify it, and append a row to
        result.xls.

        :param url: full Baidu search URL ('http://www.baidu.com/s?wd=...')
        """
        requests.adapters.DEFAULT_RETRIES = 5
        session = requests.session()
        session.keep_alive = False
        resp = session.get(url=url, headers=headers)

        xpather = etree.HTML(resp.text)
        # Result-count banner text and image thumbnails on the result page.
        strs = xpather.xpath('//span[@class="nums_text"]//text()')
        imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')

        search_mo = ['收录', '未收录']
        img_mo = ['有图', '无图']
        url_mo = url.replace('http://www.baidu.com/s?wd=', '')

        # FIX: guard strs[0] — an unexpected page layout used to raise
        # IndexError and the row was dropped; now it is recorded as 未收录.
        indexed = bool(strs) and strs[0] != "百度为您找到相关结果约0个"
        img_label = img_mo[0] if (indexed and len(imgs) > 0) else img_mo[1]
        search_label = search_mo[0] if indexed else search_mo[1]

        # FIX: the whole read-modify-write on result.xls is now atomic
        # across threads, and the save happens on every branch (the original
        # had the save commented out in the "indexed" branches).
        with xls_lock:
            workbook = xlrd.open_workbook('result.xls', formatting_info=True)
            rowNum = workbook.sheet_by_index(0).nrows
            newbook = copy(workbook)
            newsheet = newbook.get_sheet(0)
            newsheet.write(rowNum, 0, url_mo)
            newsheet.write(rowNum, 1, search_label)
            newsheet.write(rowNum, 2, img_label)
            newbook.save('result.xls')

        log(search_label, '丨', img_label, '丨', url_mo)


def main():
    """Read queries from the 'urls' file, enqueue Baidu search URLs, and run
    a pool of spider threads until the queue is drained."""
    queue_li = queue.Queue()
    threads = []
    thread_count = 10

    # Change 'urls' below to your own txt file name (one query per line).
    with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
        content = f.read()

    # FIX: split on any whitespace and drop empties — the original
    # content.split(' ') broke on the usual one-URL-per-line file.
    for query in content.split():
        queue_li.put('http://www.baidu.com/s?wd={}'.format(query))

    # FIX: the original named every thread after the loop-leftover
    # url_search variable (NameError on an empty file); use the index.
    for i in range(thread_count):
        threads.append(baiduSpider(queue_li, 'spider-{}'.format(i)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    log("Mana好伟大!(^-^)V")
    main()