1.PyQuery
from pyquery import PyQuery as pq

# Initialize from a URL
# html = ''
# doc = pq(html)
url = 'https://www.baidu.com'
doc = pq(url=url)
print(doc('head'))

# Initialize from a file
doc = pq(filename='xxx.html')
print(doc('li'))

# Basic CSS selectors
doc('#id .class a')    # a tags inside .class elements under #id
doc('#id .class.a')    # no space: both classes on the same element

# Finding elements
item = doc('.class')
lis = item.find('li')           # matching descendants
lis = item.children()           # direct children only
lis = item.children('.class')

lis = item.parent()             # parent element
lis = item.parents()            # ancestor elements
lis = item.parents('li')

item.siblings()                 # sibling elements
item.siblings('')               # siblings filtered by a selector


# Iteration
lst = doc('li').items()         # generator
for ls in lst:
    pass

# Getting attributes
lis.attr('href')
lis.attr.href

# Getting text
lis.text()

# Getting HTML
lis.html()

# DOM operations
lis.remove_class('class')       # class name, without the leading dot
lis.add_class('class')

lis.attr('name', 'link')        # add a name='link' attribute
lis.css('font-size', '14px')    # add a CSS property

lis.find('p').remove()          # remove the p tags

# Pseudo-class selectors
doc('li:first-child')           # first element
doc('li:last-child')            # last element
doc('li:nth-child(2)')          # second element
doc('li:gt(2)')                 # elements with index greater than 2
doc('li:nth-child(2n)')         # even-numbered elements
doc('li:contains(second)')      # elements whose text contains "second"
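A minimal sketch that puts the selectors above together on an inline HTML snippet; the snippet, class names and hrefs are made up for illustration:

from pyquery import PyQuery as pq

html = '''
<ul class="menu">
  <li class="item active"><a href="/a">first</a></li>
  <li class="item"><a href="/b">second</a></li>
</ul>
'''
doc = pq(html)
for li in doc('.menu .item').items():      # iterate the matched li elements
    a = li.find('a')
    print(a.attr('href'), a.text())        # -> /a first, /b second
print(doc('li:contains(second)').text())   # -> second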
2.requests
import requests
url = 'https://www.baidu.com'
resp = requests.get(url)
print(resp.cookies)
print(resp.text)

# GET with query parameters
data = {
    '': '',
    '': ''
}
resp = requests.get(url, params=data)
# Parse JSON
print(resp.json())  # equivalent to json.loads(resp.text)

# Binary content
print(resp.content)
with open('', 'wb') as f:
    f.write(resp.content)

# Adding headers
headers = {'User-Agent': ''}
resp = requests.get(url, headers=headers)


# POST
data = {}
resp = requests.post(url, data=data)
resp = requests.post(url, data=data, headers=headers)


# >>> Advanced usage
# 1. File upload
files = {'file': open('', 'rb')}
resp = requests.post(url, files=files)
# 2. Getting cookies
for key, value in resp.cookies.items():
    print(key + '=' + value)
# 3. Session persistence
import requests
# requests.get('https://httpbin.org/cookies/set/number/12346789')
# resp = requests.get('https://httpbin.org/cookies')
s = requests.Session()
s.get('https://httpbin.org/cookies/set/number/12346789')
resp = s.get('https://httpbin.org/cookies')

# 4. Certificate verification
import requests
resp = requests.get('https://www.12306.cn', verify=False)
resp = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))

# 5. Proxies
import requests
proxies = {
    'http': 'http://127.0.0.1:9473',
    'https': 'https://127.0.0.1:9473',
    # 'http': 'http://user:password@127.0.0.1:9473'  # proxy with username and password
}
resp = requests.get(url, proxies=proxies)


# 6. Authentication
import requests
from requests.auth import HTTPBasicAuth
resp = requests.get(url, auth=HTTPBasicAuth('user', '123'))


import requests
resp = requests.get(url, auth=('', ''))


# 7. Exception handling
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    pass
except ReadTimeout:
    pass
except ConnectionError:
    pass
except RequestException:
    pass
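A small sketch combining the GET pieces above against httpbin; the query parameters and User-Agent value are placeholders for illustration:

import requests
from requests.exceptions import RequestException

try:
    resp = requests.get(
        'https://httpbin.org/get',
        params={'q': 'python'},
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=5,
    )
    if resp.status_code == 200:
        print(resp.json()['args'])   # httpbin echoes the query string back
except RequestException as e:
    print('request failed:', e)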
3.selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait as wdw

url = 'https://www.baidu.com'
browser = webdriver.Chrome()
try:
    browser.get(url)
    input = browser.find_element_by_id('kw')
    input.send_keys('Python')
    input.send_keys(Keys.ENTER)
    # input.clear()
    wait = wdw(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()


# Creating browser objects
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()

# Finding an element
browser.find_element_by_id('q')
browser.find_element_by_css_selector('#q')
browser.find_element_by_xpath('//*[@id="q"]')
browser.find_element(By.ID, 'q')

# Finding multiple elements
browser.find_elements(By.CSS_SELECTOR, '.class li')
browser.find_elements_by_css_selector('.class li')

# Interacting with elements
button = browser.find_element_by_class_name('')
button.click()

# Action chains
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
url = ''
browser.get(url)
browser.switch_to.frame('')
source = browser.find_element_by_css_selector('#')
target = browser.find_element_by_css_selector('#')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()

# Executing JavaScript
browser.execute_script('alert()')

# Getting element information
logo = browser.find_element_by_css_selector('#')
logo.get_attribute('class')

# Getting the text (a property, not a method)
logo.text

# Getting the location, id, tag name and size

logo.location
logo.id
logo.tag_name
logo.size

# Frames
from selenium.common.exceptions import NoSuchElementException

browser.switch_to.frame('')
browser.switch_to.parent_frame()

# Waits
# Implicit wait
browser.implicitly_wait(10)  # wait up to 10 seconds when looking up elements, then raise

# Explicit wait (commonly used)
wait = wdw(browser, 10)
wait.until(EC.presence_of_element_located((By.ID, 'q')))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '')))


# Back and forward
browser.back()
browser.forward()

# Cookies
browser.get_cookies()
browser.add_cookie({'name': '', 'value': ''})
browser.delete_all_cookies()

# Tab management
browser.execute_script('window.open()')               # open a new tab
browser.switch_to.window(browser.window_handles[1])   # switch to it

# Exception handling
from selenium.common.exceptions import TimeoutException, NoSuchElementException
try:
    pass
except TimeoutException:
    pass
except NoSuchElementException:
    pass
finally:
    browser.close()
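A minimal headless sketch tying explicit waits and element information together; it assumes a chromedriver binary is available on PATH, and the CSS selector is just an illustrative choice:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

opts = Options()
opts.add_argument('--headless')            # run without a visible window
browser = webdriver.Chrome(options=opts)
try:
    browser.get('https://www.baidu.com')
    wait = WebDriverWait(browser, 10)
    # wait until at least one link is present, then read its attributes
    link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a')))
    print(link.tag_name, link.get_attribute('href'), link.text)
finally:
    browser.quit()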
4.re
import re

# match --- matches from the start of the string
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*$', content)
print(result)
print(result.group())
print(result.span())


# Reuse a compiled pattern
pattern = re.compile('^Hello.*Demo$', re.S)
result = re.match(pattern, content)

# Generic matching
result = re.match('^Hello.*Demo$', content)
# (\d+) captures a group; . does not match newlines unless re.S is used
# Greedy matching: .*   non-greedy matching: .*?


# search  --- returns the first successful match anywhere in the string
# findall --- returns all matches as a list
# sub     --- substitution
# compile --- compiles a regex string into a pattern object

# Tips: prefer generic patterns, use () to capture targets, prefer non-greedy matching,
# add re.S when the text contains newlines, and prefer search over match when possible.



print('Practice' + 20 * '-')
import requests
content = requests.get('https://book.douban.com/').text
print(content)
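To continue the practice snippet, a hedged example of pulling link URLs and text out of the downloaded HTML with findall; the <a> pattern is a generic guess and the real Douban markup may differ:

import re
import requests

content = requests.get('https://book.douban.com/').text
# generic pattern: capture href and link text from every <a> tag
results = re.findall(r'<a[^>]*?href="(.*?)"[^>]*?>(.*?)</a>', content, re.S)
for href, text in results[:10]:
    print(href, re.sub(r'\s+', ' ', text).strip())   # collapse whitespace in the text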
5.urllib
'''
>>> The urllib package
--- urllib.request      request module
--- urllib.error        exception handling module
--- urllib.parse        URL parsing module
--- (urllib.robotparser robots.txt parsing module) --- not a focus here
'''

url = 'https://www.baidu.com'

### GET request
import urllib.request
resp = urllib.request.urlopen(url)
print(resp.read().decode('utf-8'))

### POST request
import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
resp = urllib.request.urlopen(url, data=data)
print(resp.read())

### Exceptions
import urllib.request
import urllib.error
import socket
try:
    resp = urllib.request.urlopen(url, timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

### Response
resp.status
resp.getheaders()
resp.getheader('Server')

### Adding parameters
### Request
import urllib.request
request = urllib.request.Request(url)
resp = urllib.request.urlopen(request)


from urllib import request, parse
headers = {'User-Agent': ''}  # or req.add_header('', '')
data = bytes(parse.urlencode({'': ''}), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
resp = request.urlopen(req)

### Proxies (handler)
from urllib import request

proxy_handler = request.ProxyHandler({
    'http': 'http://xxx.x.x.x:xxxx',
    'https': 'https://xxx.x.x.x:xxxx'
})
opener = request.build_opener(proxy_handler)
resp = opener.open(url)

### Cookies
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)
for item in cookie:
    print(item.name + '=' + item.value)


import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # or http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)
cookie.save(ignore_discard=True, ignore_expires=True)


import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)




# **** Key points ****
# urlencode
from urllib.parse import urlencode
params = {
    '': '',
    '': ''
}
base_url = 'https://www.baidu.com?'
url = base_url + urlencode(params)

# URL parsing
from urllib.parse import urlparse
result = urlparse(url)                         # splits the URL into scheme, netloc, path, ...
result = urlparse(url, scheme='https')         # default scheme when the URL has none
result = urlparse(url, allow_fragments=False)
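A small sketch combining urlencode, Request and error handling against httpbin; the query parameters and User-Agent value are placeholders:

from urllib import request, parse, error

params = {'q': 'python', 'page': '1'}
url = 'https://httpbin.org/get?' + parse.urlencode(params)
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    with request.urlopen(req, timeout=5) as resp:
        print(resp.status, resp.getheader('Content-Type'))
        print(resp.read().decode('utf-8'))
except error.URLError as e:
    print('request failed:', e.reason)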
6.xpath
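A minimal XPath sketch using lxml's etree.HTML and xpath, parsing an inline HTML snippet; the snippet, class names and hrefs are made up for illustration:

from lxml import etree

html = '''
<div class="list">
  <ul>
    <li class="item"><a href="/link1">first</a></li>
    <li class="item active"><a href="/link2">second</a></li>
  </ul>
</div>
'''
doc = etree.HTML(html)
print(doc.xpath('//li/a/@href'))                                # -> ['/link1', '/link2']
print(doc.xpath('//li[contains(@class, "active")]/a/text()'))   # -> ['second']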
7.bs
from bs4 import BeautifulSoup as bs
html = ''
soup = bs(html, 'lxml')     # BeautifulSoup parses an HTML string, not a URL
soup.prettify()             # pretty-print
soup.title.string           # text of the title tag


#>>>> Tag selectors
# Selecting elements
print(soup.head)            # the head tag
print(soup.p)               # the first p tag
# Getting the name
soup.title.name             # ---> title
# Getting attributes
soup.p.attrs['name']        # the name attribute of the p tag
soup.p['name']
# Getting the content
soup.p.string               # text of the p tag
# Nested selection
soup.head.title.string      # text of the title tag inside head
# Children and descendants
soup.p.contents             # children of the p tag, as a list
soup.p.children             # children, as an iterator
for i, child in enumerate(soup.p.children):
    print(i, child)
soup.p.descendants          # descendants, as an iterator
# Parent / ancestors
soup.p.parent               # parent node
soup.p.parents              # ancestor nodes
# Siblings
soup.a.next_siblings
soup.a.previous_siblings


#>>>> Standard selectors
# find_all
soup.find_all('li')         # all li tags
soup.find_all('li')[0]      # the first li tag
# attrs
soup.find_all(attrs={'id': 'list_a'})    # all tags with id="list_a"
soup.find_all(id='list_a')               # all tags with id="list_a"
soup.find_all(class_='list_a')           # all tags with class="list_a"
# text
soup.find_all(text='')      # match by tag text

# find_next_siblings()
# find_next_sibling()
# find_previous_siblings()
# find_previous_sibling()
# find_next()
# find_previous()
# find_all_next()
# find_all_previous()

#>>> CSS selectors
# select
soup.select('.class_ .class_1')
soup.select('ul li')
tar = soup.select('#id .class_')[0]   # first tag with class="class_" under id="id"
soup.select('ul')[0]
# Getting attributes
tar['']
tar.attrs['']
# Getting the content
tar.get_text()
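A minimal runnable sketch of the BeautifulSoup calls above on an inline HTML snippet; the snippet, ids and classes are illustrative:

from bs4 import BeautifulSoup

html = '''
<div id="list"><ul>
  <li class="item"><a href="/a">first</a></li>
  <li class="item"><a href="/b">second</a></li>
</ul></div>
'''
soup = BeautifulSoup(html, 'lxml')
for li in soup.find_all('li', class_='item'):
    a = li.a
    print(a['href'], a.get_text())                          # -> /a first, /b second
print(soup.select('#list .item')[0].get_text(strip=True))   # -> first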