No1:
# -*- coding:utf-8 -*-
import urllib.request

ua_headers = {"User-Agent": "..."}
request = urllib.request.Request("http://www.baidu.com", headers=ua_headers)
response = urllib.request.urlopen(request)
html = response.read()
print(html)
No2:
# -*- coding:utf-8 -*-
import urllib.request
from urllib import parse


def loadPage(url, filename):
    print("正在下载" + filename)
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()


def writePage(html, filename):
    print("正在保存" + filename)
    with open(filename, "wb+") as f:
        f.write(html)
    print("-" * 30)


def tiebaSpider(url, beginPage, endPage):
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "第" + str(page) + "页.html"
        fullurl = url + "&pn=" + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
    print("谢谢使用")


if __name__ == '__main__':
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
No3:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
from urllib import parse

# URL captured by sniffing the request, not the URL shown in the browser
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"

# Full request headers
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
}

# User input
key = input("请输入需要翻译的文字:")

# Form data sent to the web server
formdata = {
    "type": "AUTO",
    "i": key,
    "doctype": "json",
    "xmlVersion": "1.8",
    "keyfrom": "fanyi.web",
    "ue": "UTF-8",
    "action": "FY_BY_CLICKBUTTON",
    "typoResult": "true"
}

# URL-encode the form data
data = parse.urlencode(formdata).encode(encoding='UTF8')

# If the data argument of Request() is set, the request is a POST;
# otherwise it is a GET.
request = urllib.request.Request(url, data=data, headers=headers)
print(str(urllib.request.urlopen(request).read(), 'utf-8'))
No4:
AJAX
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
from urllib import parse

url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

formdata = {
    "start": "0",
    "limit": "20"
}
data = parse.urlencode(formdata).encode(encoding='utf-8')

request = urllib.request.Request(url, data=data, headers=headers)
print(str(urllib.request.urlopen(request).read(), 'utf-8'))
No5:
handler
import urllib.request

# debuglevel=1 makes the handler print the HTTP traffic to stdout
http_handler = urllib.request.HTTPHandler(debuglevel=1)
opener = urllib.request.build_opener(http_handler)

request = urllib.request.Request("http://www.baidu.com/")
response = opener.open(request)
print(str(response.read(), 'utf-8'))
No6:
proxy
import urllib.request

proxyswitch = True

# Opener that routes requests through an HTTP proxy, and one without any proxy
httpproxy_handler = urllib.request.ProxyHandler({"http": "222.22.66.211"})
nullproxy_handler = urllib.request.ProxyHandler({})

if proxyswitch:
    opener = urllib.request.build_opener(httpproxy_handler)
else:
    opener = urllib.request.build_opener(nullproxy_handler)

# Install the opener globally so urlopen() uses it
urllib.request.install_opener(opener)

request = urllib.request.Request("http://www.baidu.com/")
response = urllib.request.urlopen(request)
print(str(response.read(), 'utf-8'))
No7:
HTTP basic auth
import urllib.request

username = "test"
password = "123456"
webserver = "192.168.21.52"

# Password manager; realm=None means the credentials apply to any realm
passwordMgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
passwordMgr.add_password(None, webserver, username, password)

httpauth_handler = urllib.request.HTTPBasicAuthHandler(passwordMgr)
opener = urllib.request.build_opener(httpauth_handler)

request = urllib.request.Request("http://" + webserver)
response = opener.open(request)
print(str(response.read(), 'utf-8'))
No8:
cookie
from urllib import request
from urllib import parse
from http import cookiejar

# CookieJar stores the cookies; HTTPCookieProcessor attaches it to the opener
cookie = cookiejar.CookieJar()
cookie_handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(cookie_handler)
opener.addheaders = [("User-Agent", "xxx")]

url = "http://www.renren.com/PLogin.do"
data = {"email": "xxx@163.com", "password": "xxx"}
data = parse.urlencode(data).encode(encoding='UTF-8')

# Use a name other than `request` so the imported module is not shadowed
req = request.Request(url, data=data)
response = opener.open(req)
print(str(response.read(), 'utf-8'))
No9:
Scraping jokes (xiaohua.zol.com.cn)
from urllib import request
import re


class Spider:
    def __init__(self):
        self.page = 1
        self.switch = True

    def loadPage(self):
        print("正在下载数据...")
        url = "http://xiaohua.zol.com.cn/new/" + str(self.page) + ".html"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/535.11"}
        req = request.Request(url, headers=headers)
        response = request.urlopen(req)
        html = str(response.read(), "gbk")
        # Each joke sits inside a <div class="summary-text"> block
        pattern = re.compile(r'<div\s+class="summary-text">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        for item in content_list:
            # Strip the HTML tags and whitespace left in the matched text
            item = item.replace("<p>", "").replace("</p>", "").replace("<br>", "").replace('<p class="bbsp">', "").replace(" ", "")
            self.writePage(item)

    def writePage(self, item):
        print("正在写入数据...")
        with open("duanzi.txt", "a", encoding="utf-8") as f:
            f.write(item)

    def startWork(self):
        while self.switch:
            self.loadPage()
            command = input("如果继续爬取,请按回车(退出输入quit)")
            if command == "quit":
                self.switch = False
            self.page += 1
        print("谢谢使用!")


if __name__ == "__main__":
    duanzi = Spider()
    duanzi.startWork()
No10:
Scraping images from Baidu Tieba
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request
from urllib import parse
from lxml import etree


def loadPage(url):
    print("正在下载...")
    req = request.Request(url)
    html = request.urlopen(req).read()
    # Parse the HTML document into an HTML DOM model
    content = etree.HTML(html)
    # Return the list of all matching thread links
    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    # link_list = content.xpath('//a[@class="j_th_tit"]/@href')
    for link in link_list:
        # Build the full link for each thread
        fulllink = "http://tieba.baidu.com" + link
        print("link=" + link)
        loadImage(fulllink)


# Extract every image link inside a thread
def loadImage(link):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    req = request.Request(link, headers=headers)
    html = request.urlopen(req).read()
    # Parse the thread page
    content = etree.HTML(html)
    # Collect the image links posted in the thread
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    # link_list = content.xpath('//div[@class="post_bubble_middle"]')
    for link in link_list:
        print("imglink=" + link)
        writeImage(link)


def writeImage(link):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    req = request.Request(link, headers=headers)
    # Raw image bytes
    image = request.urlopen(req).read()
    # Use the last 10 characters of the link as the file name
    filename = link[-10:]
    # Write the image to a local file
    with open(filename, "wb") as f:
        f.write(image)
    print("已经成功下载 " + filename)


def tiebaSpider(url, beginPage, endPage):
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)
    print("谢谢使用")


if __name__ == "__main__":
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
No11:
Scraping Baidu Images
# -*- coding: utf-8 -*-
"""Download Baidu Images results for a given search keyword."""
import re
import urllib.parse

import requests


def get_onepage_urls(onepageurl):
    """Return all image URLs on one result page plus the URL of the next page."""
    if not onepageurl:
        print('已到最后一页, 结束')
        return [], ''
    try:
        html = requests.get(onepageurl)
        html.encoding = 'utf-8'
        html = html.text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls):
    """Download every picture in the given list of URLs."""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(string, 'wb') as f:
                f.write(pic.content)
                print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue


if __name__ == '__main__':
    keyword = '中国美女'  # Search keyword, same as typing it into Baidu Images
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + urllib.parse.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)

    fanye_count = 0  # Number of result pages followed so far
    while 1:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)

    down_pic(list(set(all_pic_urls)))
No12:
Zhihu login
from bs4 import BeautifulSoup
import requests
import time


def captcha(captcha_data):
    # Save the captcha image locally and let the user type it in
    with open("captcha.jpg", "wb") as f:
        f.write(captcha_data)
    text = input("请输入验证码:")
    return text


def zhihuLogin():
    # Use a Session so cookies are carried across requests
    sess = requests.Session()
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

    html = sess.get("https://www.zhihu.com/#signin", headers=headers).text
    bs = BeautifulSoup(html, "lxml")
    # The hidden _xsrf token is required for the login POST
    _xsrf = bs.find("input", attrs={"name": "_xsrf"}).get("value")

    captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
    captcha_data = sess.get(captcha_url, headers=headers).content
    text = captcha(captcha_data)

    data = {
        "_xsrf": _xsrf,
        "email": "",
        "password": "",
        "captcha": text
    }
    response = sess.post("https://www.zhihu.com/login/email", data=data, headers=headers)
    print(response.text)


if __name__ == "__main__":
    zhihuLogin()
No13:
JSON parsing
import urllib.request
import json
import jsonpath

url = "http://www.lagou.com/lbs/getAllCitySearchLabels.json"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
html = response.read()

# Parse the JSON response into a Python object
unicodestr = json.loads(html)
# Pull every "name" field, at any depth, with a JSONPath expression
city_list = jsonpath.jsonpath(unicodestr, "$..name")

for item in city_list:
    print(item)

array = json.dumps(city_list, ensure_ascii=False)
with open("lagoucity.json", "wb+") as f:
    f.write(array.encode("utf-8"))
No14:
XPath parsing with lxml
# -*- coding:utf-8 -*-
import urllib.request
from lxml import etree
import json

url = "http://www.qiushibaike.com/8hr/page/1/"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
request = urllib.request.Request(url, headers=headers)
html = urllib.request.urlopen(request).read()

# Parse the page and select every joke block by its id prefix
text = etree.HTML(html)
node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

for node in node_list:
    username = node.xpath('./div/a/@title')[0]
    image = node.xpath('.//div[@class="thumb"]//@src')
    content = node.xpath('.//div[@class="content"]/span')[0].text
    zan = node.xpath('.//i')[0].text
    comments = node.xpath('.//i')[1].text

    items = {
        "username": username,
        "image": image,
        "content": content,
        "zan": zan,
        "comments": comments
    }

    # Append each record as one JSON line
    with open("qiushi.json", "ab+") as f:
        f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + b"\n")
No15:
Multithreading
# -*- coding:utf-8 -*-
import threading
from queue import Queue
from lxml import etree
import requests
import json

CRAWL_EXIT = False
PARSE_EXIT = False


class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print("启动" + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises queue.Empty when no page is left
                page = self.pageQueue.get(False)
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                content = requests.get(url, headers=self.headers).text
                self.dataQueue.put(content)
            except Exception:
                pass
        print("结束" + self.threadName)


class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, filename):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.filename = filename

    def run(self):
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except Exception:
                pass

    def parse(self, html):
        html = etree.HTML(html)
        node_list = html.xpath('//div[contains(@id, "qiushi_tag")]')
        for node in node_list:
            username = node.xpath('./div/a/@title')[0]
            image = node.xpath('.//div[@class="thumb"]//@src')
            content = node.xpath('.//div[@class="content"]/span')[0].text
            zan = node.xpath('.//i')[0].text
            comments = node.xpath('.//i')[1].text
            items = {
                "username": username,
                "image": image,
                "content": content,
                "zan": zan,
                "comments": comments
            }
            # Append each record as one JSON line to the shared file handle
            self.filename.write(json.dumps(items, ensure_ascii=False) + "\n")


def main():
    global CRAWL_EXIT, PARSE_EXIT

    # Queue of page numbers to crawl
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Queue of downloaded HTML waiting to be parsed
    dataQueue = Queue()
    filename = open("duanzi.json", "a", encoding="utf-8")

    crawlList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    parseList = ["解析线程1号", "解析线程2号", "解析线程3号"]
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename)
        thread.start()
        threadparse.append(thread)

    # Wait until every page number has been taken off the queue
    while not pageQueue.empty():
        pass
    CRAWL_EXIT = True
    for thread in threadcrawl:
        thread.join()

    # Wait until all downloaded pages have been parsed, then stop the parsers
    while not dataQueue.empty():
        pass
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()

    filename.close()
    print("谢谢使用")


if __name__ == "__main__":
    main()
No16:
Selenium / WebDriver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Path to the PhantomJS executable; adjust to your local install
driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://www.baidu.com/")

driver.find_element_by_id("kw").send_keys(u"中国美女")
# driver.find_element_by_id("su").click()
driver.find_element_by_id("su").send_keys(Keys.ENTER)

driver.save_screenshot("girl.png")
driver.get_cookies()
print(driver.page_source)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("https://www.douban.com/")

driver.find_element_by_name("form_email").send_keys("mr.mao.tony@gmail.com")
driver.find_element_by_name("form_password").send_keys("Mzj60055969alarm")
driver.find_element_by_id("captcha_field").send_keys("short")
driver.find_element_by_class_name("bn-submit").click()

driver.save_screenshot("douban.png")
No17:
unittest testing
from selenium import webdriver
import unittest
from bs4 import BeautifulSoup as bs


class douyu(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')

    def testDouyu(self):
        self.driver.get("https://www.douyu.com/directory/all")
        while True:
            soup = bs(self.driver.page_source, "lxml")
            names = soup.find_all("h3", {"class": "ellipsis"})
            numbers = soup.find_all("span", {"class": "dy-num fr"})
            for name, number in zip(names, numbers):
                print(u"观众人数:" + number.get_text().strip() + u" 房间名:" + name.get_text().strip())
            # Stop when the "next page" button is disabled
            if self.driver.page_source.find("shark-pager-disable-next") != -1:
                break
            self.driver.find_element_by_class_name("shark-pager-next").click()

    def tearDown(self):
        self.driver.quit()


if __name__ == "__main__":
    unittest.main()
No18:
Executing JavaScript
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from selenium import webdriver
import time

driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("https://movie.douban.com/typerank?type_name=剧情&type=11&interval_id=100:90&action=")
time.sleep(30)

# Scroll the page down 10000 pixels
js = "document.body.scrollTop=10000"
# js = "var q=document.documentElement.scrollTop=10000"

# Snapshot before scrolling
driver.save_screenshot("douban.png")

# Execute the JavaScript statement
driver.execute_script(js)
time.sleep(20)

# Snapshot after scrolling
driver.save_screenshot("newdouban.png")
driver.quit()
No19:
Tesseract: recognizing text in images (captchas)
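This entry has no code in the original notes, so the following is only a minimal sketch of driving Tesseract from Python through the pytesseract wrapper. The file name captcha.jpg, the chi_sim language code, the binarization threshold, and the commented Windows path to tesseract.exe are illustrative assumptions, not values from the original.

# A minimal sketch, assuming pytesseract and Pillow are installed and the
# Tesseract binary (with the needed language data) is available locally.
import pytesseract
from PIL import Image

# On Windows the path to tesseract.exe may need to be set explicitly
# (hypothetical install location, adjust to your machine):
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# "captcha.jpg" is a placeholder file name, e.g. the image saved by the
# captcha() helper in the Zhihu login example above.
image = Image.open("captcha.jpg")

# Convert to grayscale and binarize to reduce background noise before OCR
image = image.convert("L")
image = image.point(lambda px: 255 if px > 140 else 0)

# lang="chi_sim" recognizes simplified Chinese; use lang="eng" for
# plain alphanumeric captchas
text = pytesseract.image_to_string(image, lang="chi_sim")
print(text.strip())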