• Python scraping basics (requests, BeautifulSoup)


    • The requests module

    1. Sending requests

    # GET request
    requests.get(url='xxx',     # the URL
                 params={},     # query-string parameters appended to the URL
                 headers={},    # request headers
                 cookies={}     # cookies
                 )              # equivalent to requests.request(method='get', url='xxx')
    # POST request
    requests.post(url='xxx',
                  params={},
                  headers={},
                  data={},      # form data
                  json={},      # data sent as a JSON body
                  cookies={}
                  )             # equivalent to requests.request(method='post', url='xxx')
    response = requests.get('http://www.baidu.com')
    print(response.status_code)   # status code
    print(response.url)           # request URL
    print(response.headers)       # response headers
    print(response.cookies)       # cookies
    print(response.text)          # page source as text
    print(response.content)       # page source as raw bytes
    response.encoding = response.apparent_encoding  # apparent_encoding detects the charset; encoding sets it
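    The data= and json= parameters produce different request bodies. A minimal sketch against httpbin.org (a public echo service, used here purely for illustration):

    import requests

    # data= sends an application/x-www-form-urlencoded body
    r1 = requests.post('https://httpbin.org/post', data={'k': 'v'})
    print(r1.json()['form'])   # {'k': 'v'}

    # json= serializes the dict and sets Content-Type: application/json
    r2 = requests.post('https://httpbin.org/post', json={'k': 'v'})
    print(r2.json()['json'])   # {'k': 'v'}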

    2. Other parameters

    url
    params
    data               # dict of form fields
    json               # dict, serialized to JSON
    headers
    cookies            # dict
    files              # file upload
    auth               # basic authentication
    timeout            # timeout in seconds
    allow_redirects    # whether to follow redirects, defaults to True
    proxies            # proxy servers to route through
    verify             # whether to verify the TLS certificate
    stream             # stream the response body instead of downloading it at once
    cert               # client-side certificate

     Documentation: http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
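
    A minimal sketch exercising several of these parameters together (the URL and proxy address are placeholders, not working endpoints):

    import requests

    response = requests.get(
        'https://example.com/api',
        params={'q': 'python'},                      # appended as ?q=python
        auth=('user', 'pass'),                       # HTTP basic auth
        timeout=5,                                   # seconds before raising a Timeout
        allow_redirects=False,                       # do not follow 3xx responses
        proxies={'https': 'http://127.0.0.1:8888'},  # route through a local proxy
        verify=True,                                 # verify the TLS certificate
        stream=True,                                 # defer downloading the body
    )
    print(response.status_code)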

    BeautifulSoup
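
    The snippets below all operate on a soup object. A minimal setup they assume, using the classic "three sisters" sample document from the Beautiful Soup documentation:

    from bs4 import BeautifulSoup

    html = '''
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    </body></html>
    '''
    soup = BeautifulSoup(html, 'html.parser')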

    1. name, the tag's name
    # tag = soup.find('a')
    # name = tag.name    # get
    # tag.name = 'span'  # set
    
    2. attrs, the tag's attributes
    # tag = soup.find('a')
    # attrs = tag.attrs          # get
    # tag.attrs = {'ik': 123}    # set (replaces all attributes)
    # tag.attrs['id'] = 'iiiii'  # set a single attribute
    
    3. children, all direct child tags
    # body = soup.find('body')
    # v = body.children
    
    4. descendants, all descendant tags (children, grandchildren, and so on)
    # body = soup.find('body')
    # v = body.descendants
    
    5. clear, empty out all of a tag's children (the tag itself is kept)
    # tag = soup.find('body')
    # tag.clear()
    
    6. decompose, recursively remove the tag and everything inside it
    # body = soup.find('body')
    # body.decompose()
    
    7. extract, recursively remove the tag and return what was removed
    # body = soup.find('body')
    # v = body.extract()
    
    8. decode, render as a string (including the current tag); decode_contents (excluding the current tag)
    # body = soup.find('body')
    # v = body.decode()
    # v = body.decode_contents()
    
    9. encode, render as bytes (including the current tag); encode_contents (excluding the current tag)
    # body = soup.find('body')
    # v = body.encode()
    # v = body.encode_contents()
    
    10. find, get the first matching tag
    # tag = soup.find('a')
    # tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    # tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    
    11. find_all, get all matching tags
    # tags = soup.find_all('a')
     
    # tags = soup.find_all('a',limit=1)
     
    # tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    
    # v = soup.find_all(name=['a','div'])
    
    # v = soup.find_all(class_=['sister0', 'sister'])
     
    # v = soup.find_all(text=['Tillie'])
     
    # v = soup.find_all(id=['link1','link2'])
     
    # v = soup.find_all(href=['link1','link2'])
     
    # ####### regular expressions #######
    import re
    # rep = re.compile('^p')
    # v = soup.find_all(name=rep)
     
    # rep = re.compile('sister.*')
    # v = soup.find_all(class_=rep)
     
    # rep = re.compile('http://www.oldboy.com/static/.*')
    # v = soup.find_all(href=rep)
     
    # ####### filtering with a function #######
    # def func(tag):
    #     return tag.has_attr('class') and tag.has_attr('id')
    # v = soup.find_all(name=func)
     
    # ## get, fetch a tag attribute
    # tag = soup.find('a')
    # v = tag.get('id')
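
    A quick runnable pass over the sample soup above, mixing the filter styles:

    import re

    print(soup.find('a').get('id'))                             # link1
    print(len(soup.find_all('a')))                              # 3
    print(soup.find_all(class_=re.compile('sist.*'))[0].text)   # Elsie
    print(soup.find_all(lambda t: t.has_attr('id'))[0].name)    # a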
    
    12. has_attr, check whether the tag has a given attribute
    # tag = soup.find('a')
    # v = tag.has_attr('id')
    
    13. get_text, get the text inside a tag
    # tag = soup.find('a')
    # v = tag.get_text()  # the optional first argument is a separator string, not an attribute name
    
    14. index, find a tag's index position inside another tag
    # tag = soup.find('body')
    # v = tag.index(tag.find('div'))
     
    # tag = soup.find('body')
    # for i, v in enumerate(tag):
    #     print(i, v)
        
    15. is_empty_element, whether the tag is an empty or self-closing element,
    
    i.e. one of the following tags: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
    # tag = soup.find('br')
    # v = tag.is_empty_element
    # print(v)
    
    16. Related tags around the current one
    # soup.next
    # soup.next_element
    # soup.next_elements
    # soup.next_sibling
    # soup.next_siblings
    # tag.previous
    # tag.previous_element
    # tag.previous_elements
    # tag.previous_sibling
    # tag.previous_siblings
    # tag.parent
    # tag.parents
    
    17. Searching for a tag's related tags
    # tag.find_next(...)
    # tag.find_all_next(...)
    # tag.find_next_sibling(...)
    # tag.find_next_siblings(...)
    # tag.find_previous(...)
    # tag.find_all_previous(...)
    # tag.find_previous_sibling(...)
    # tag.find_previous_siblings(...)
    # tag.find_parent(...)
    # tag.find_parents(...)
    # these take the same arguments as find_all
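
    For example, against the sample soup:

    first = soup.find('a')
    print(first.find_next_sibling('a').get('id'))  # link2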
    
    18. select, select_one: CSS selectors
    soup.select("title")
    soup.select("p nth-of-type(3)")
    soup.select("body a")
    soup.select("html head title")
    soup.select("span,a")
    soup.select("head > title")
    soup.select("p > a")
    soup.select("p > a:nth-of-type(2)")
    soup.select("p > #link1")
    soup.select("body > a")
    soup.select("#link1 ~ .sister")
    soup.select("#link1 + .sister")
    soup.select(".sister")
    soup.select("[class~=sister]")
    soup.select("#link1")
    soup.select("a#link2")
    soup.select('a[href]')
    soup.select('a[href="http://example.com/elsie"]')
    soup.select('a[href^="http://example.com/"]')
    soup.select('a[href$="tillie"]')
    soup.select('a[href*=".com/el"]')
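
    For example, against the sample soup:

    print(soup.select_one('#link1')['href'])           # http://example.com/elsie
    print([t.text for t in soup.select('a.sister')])   # ['Elsie', 'Lacie', 'Tillie']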
     
    19. Tag contents
    # tag = soup.find('span')
    # print(tag.string)          # get
    # tag.string = 'new content' # set
     
    # tag = soup.find('body')
    # v = tag.stripped_strings   # generator over the text of every nested tag, whitespace stripped
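
    For example, against the sample soup:

    body = soup.find('body')
    print(list(body.stripped_strings))  # ["The Dormouse's story", 'Elsie', 'Lacie', 'Tillie']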
    
    20. append, append a tag inside the current tag
    # tag = soup.find('body')
    # tag.append(soup.find('a'))
    # print(soup)
    #
    # from bs4.element import Tag
    # obj = Tag(name='i',attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('body')
    # tag.append(obj)
    # print(soup)
    
    21. insert, insert a tag at a given position inside the current tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('body')
    # tag.insert(2, obj)
    # print(soup)
    
    22. insert_after, insert_before: insert after or before the current tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('body')
    # # tag.insert_before(obj)
    # tag.insert_after(obj)
    # print(soup)
    
    23. replace_with, replace the current tag with the given tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('div')
    # tag.replace_with(obj)
    # print(soup)
    
    24. Creating relationships between tags (setup rewires navigation pointers such as previous_sibling without changing the document's markup)
    # tag = soup.find('div')
    # a = soup.find('a')
    # tag.setup(previous_sibling=a)
    # print(tag.previous_sibling)
    
    25. wrap, wrap the current tag inside the given tag
    # from bs4.element import Tag
    # obj1 = Tag(name='div', attrs={'id': 'it'})
    # obj1.string = 'I am new here'
    #
    # tag = soup.find('a')
    # v = tag.wrap(obj1)
    # print(soup)
     
    # tag = soup.find('a')
    # v = tag.wrap(soup.find('p'))
    # print(soup)
    
    26. unwrap, remove the current tag but keep what it wrapped
    # tag = soup.find('a')
    # v = tag.unwrap()
    # print(soup)

     • Batch upvoting on chouti (dig.chouti.com)

    from bs4 import BeautifulSoup
    import requests


    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400',
    }

    # GET the home page first to pick up the cookies the site expects back
    response_1 = requests.get(url='https://dig.chouti.com/',
                              headers=headers)
    cookie_dict = response_1.cookies.get_dict()

    # log in, replaying the cookies from the first request
    response_2 = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone': '8615733239039',
            'password': 'xxxxxx',
            'oneMonth': '1',
        },
        headers=headers,
        cookies=cookie_dict
    )

    # walk the first two pages of the hot list and upvote every item
    for page in range(1, 3):
        html = requests.get(url='https://dig.chouti.com/all/hot/recent/{}'.format(page), headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        divs = soup.find(name='div', id='content-list')
        items = divs.find_all(attrs={'class': 'item'})
        for i in items:
            click_id = i.find('img').get('lang')  # the item id is stored in the img tag's lang attribute
            if click_id:
                print(click_id)
                click_hand = requests.post(
                    url='https://dig.chouti.com/link/vote?linksId={}'.format(click_id),
                    headers=headers,
                    cookies=cookie_dict,
                )
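    The cookie bookkeeping above can also be delegated to requests.Session, which persists cookies across calls automatically. A sketch of just the login portion under that approach:

    import requests

    session = requests.Session()
    session.headers.update(headers)          # reuse the headers dict from above
    session.get('https://dig.chouti.com/')   # response cookies are stored on the session
    session.post('https://dig.chouti.com/login', data={
        'phone': '8615733239039',
        'password': 'xxxxxx',
        'oneMonth': '1',
    })                                       # and sent back automatically here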

      Reference: http://www.cnblogs.com/wupeiqi/articles/6283017.html
