• Crawler basics


    # The following flow fails to complete the upvote
    import requests
    # Whether the request is POST or GET,
    # watch out for anti-crawler measures: usually adding a request header is enough
    # Log in
    response_login = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone': '8613125397685',
            'password': '478324asd',
            'oneMonth': '1'
        },
        # with a request header added, the request goes through
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
    )
    cookies_dict = response_login.cookies.get_dict()  # just a dict
    # print(cookies_dict)  # print the cookie info
    # upvote
    r1 = requests.get(
        url='',  # with only a URL the request may be blocked, so add a request header
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
        cookies=cookies_dict
    )
    print(r1.text)
    # This flow completes the upvote correctly: after the login succeeds, the server
    # authorizes the cookie obtained on the first (anonymous) visit, so the vote must
    # reuse that first cookie rather than the login response's cookie
    import requests
    # Steps 2 and 3 are both POST requests
    # Step 1: visit the chouti hot list and obtain a cookie (not yet authorized)
    r1 = requests.get(
        url='https://dig.chouti.com/all/hot/recent/1',  # with only a URL the request may be blocked, so add a request header
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
    )
    r1_cookie_dict = r1.cookies.get_dict()
    # Step 2: send username and password for authentication, carrying the (unauthorized) cookie
    # note the anti-crawler measures
    response_login = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone': '8613125397685',
            'password': '478324asd',
            'oneMonth': '1'
        },
        # with a request header added, the request goes through
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
        cookies=r1_cookie_dict
    )
    # Step 3: upvote
    r1 = requests.post(
        url='https://dig.chouti.com/link/vote?linksId=22900531',  # with only a URL the request may be blocked, so add a request header
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
        cookies=r1_cookie_dict  # now authorized
    )
    print(r1.text)
To comment out multiple lines in PyCharm:

1. Select the lines

2. Press Ctrl + /

To remove the multi-line comment:

1. Select the commented lines

2. Press Ctrl + /
    
When working in PyCharm you often need to indent or un-indent several lines of code at once; PyCharm provides shortcuts for this.

1. Indent multiple lines: select the lines with the mouse, then press Tab; each press indents by four characters.

2. Un-indent multiple lines: select the lines with the mouse, then press Shift+Tab; each press moves them left by four characters.
    
    
    # When a bookmark marker appears in the left gutter, press F11 to remove it
    import requests  # forge a browser request to some address
    from bs4 import BeautifulSoup  # parse an HTML-formatted string
    # Step 1: download the page
    ret = requests.get(url='https://www.autohome.com.cn/news/')
    # print(ret)  # ret is a Response object
    # print(ret.content)  # raw bytes
    # print(ret.text)  # string, but may show garbled characters
    # ret.encoding = 'gbk'  # you can set the encoding yourself
    # print(ret.text)  # with the right encoding the text is no longer garbled
    # print(ret.apparent_encoding)  # the encoding detected from the page itself
    ret.encoding = ret.apparent_encoding  # set it straight to the page's own encoding
    # print(ret.text)
    # Step 2: extract the desired content with BeautifulSoup
    soup = BeautifulSoup(ret.text, 'html.parser')  # parser name; no extra space inside the quotes
    # print(type(soup))  # soup is an object
    # div = soup.find(name='div', id='focus-1')
    div = soup.find(name='div', attrs={'id': 'focus-1', 'class': 'focusimg focusimg02'})
    print(div)
    li_list = div.find_all('li')  # a list
    # print(li_list)
    for li in li_list:
        h2 = li.find('h2')
        a = li.find('a')
        p = li.find('p')  # the first positional argument is the tag name

        img = li.find('img')
        src = img.get('src')
        file_name = src.rsplit('__', maxsplit=1)[1]
        ret_img = requests.get(url='https:' + src)
        with open(file_name, 'wb') as f:
            f.write(ret_img.content)
        print(h2.text, a.get('href'))  # the browser address bar adds http/https to the href automatically
        print(p.text)
        print('=' * 15)
        # print(a.attrs)  # all attributes of a
        # print(a.get('href'))  # get one attribute of a
        # print(h2)
        # print(h2.text)
        # print(a.text)
        # p = li.find('p')  # the first positional argument is the tag name
        # print(p.text)
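The loop above assumes every li under the focus div contains h2, a, p, and img children; if the page layout changes or an item is incomplete, find() returns None and the .text / .get() calls raise AttributeError. A small defensive sketch of the same loop (the tag names are unchanged; which items might be incomplete is only an assumption):

    # Hedged variant: skip list items that are missing any of the expected children
    for li in li_list:
        h2 = li.find('h2')
        a = li.find('a')
        p = li.find('p')
        img = li.find('img')
        if not all([h2, a, p, img]):
            continue  # incomplete item, skip instead of raising AttributeError
        print(h2.text, a.get('href'))
        print(p.text)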
    # Speed up downloads with the Douban mirror
    pip install requests -i https://pypi.douban.com/simple
    # If you forget the URL, search for "python 豆瓣下载源";
    # the cnblogs post "使用douban源下载python包 - 中国陆特 - 博客园" has it.
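To avoid typing -i every time, pip can also store a default index URL in its configuration; a minimal sketch (the mirror URL is simply the one from the note above, whether it is still reachable is not verified here):

    # make the mirror the default index for this user
    pip config set global.index-url https://pypi.douban.com/simple
    # later installs then pick it up automatically
    pip install requests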
    # Upvote items across multiple pages
    import requests
    from bs4 import BeautifulSoup

    for page_num in range(8, 9):
        r1 = requests.get(
            url='https://dig.chouti.com/all/hot/recent/%s' % page_num,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
            }
        )
        # print(r1.text)
        r1_cookie_dict = r1.cookies.get_dict()
        response_login = requests.post(
            url='https://dig.chouti.com/login',
            data={
                'phone': '8613125397685',
                'password': '478324asd',
                'oneMonth': '1'
            },
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
            },
            cookies=r1_cookie_dict
        )
        # response_index = requests.get(
        #     url='https://dig.chouti.com/',
        #     headers={
        #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        #     }
        # )
        soup = BeautifulSoup(r1.text, 'html.parser')  # r1.text is the right source here, not response_index
        div = soup.find(attrs={'id': 'content-list'})
        items = div.find_all(attrs={'class': 'item'})
        for item in items:
            tag = item.find(attrs={'class': 'part2'})
            nid = tag.get('share-linkid')
            print(nid)
            r1 = requests.post(  # reusing the name r1 here causes no problem
                url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
                },
                cookies=r1_cookie_dict
            )
            print(r1.text)
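Passing cookie dicts around by hand works, but requests also ships requests.Session, which stores the cookies from each response and sends them on later requests automatically. A minimal sketch of the same visit, login, and vote flow on top of Session (URLs, form fields, and the linksId value are taken from the blocks above; this is only an illustration, not something tested against the site):

    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    session = requests.Session()
    session.headers.update(headers)  # this header is sent with every request
    session.get('https://dig.chouti.com/all/hot/recent/1')  # step 1: cookie stored on the session
    session.post(  # step 2: log in; the stored cookie is carried automatically
        'https://dig.chouti.com/login',
        data={'phone': '8613125397685', 'password': '478324asd', 'oneMonth': '1'}
    )
    r = session.post('https://dig.chouti.com/link/vote?linksId=22900531')  # step 3: vote
    print(r.text)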

    import requests
    from bs4 import BeautifulSoup

    # find the CSRF token value
    r1 = requests.get(
        url='https://github.com/login'
    )
    s1 = BeautifulSoup(r1.text, 'html.parser')
    token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
    # print(r1.text)
    # print(token)
    r1_cookie_dict = r1.cookies.get_dict()
    r2 = requests.post(
        url='https://github.com/session',
        data={
            'commit': 'Sign in',
            'utf8': '',
            'authenticity_token': token,
            'login': 'clttyou',
            'password': '9430'
        },
        cookies=r1_cookie_dict
    )
    print(r2.text)  # print the login response (r1.text would just be the login page again)
    # Template for crawling a homepage
    import requests

    def getHtml(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return "an exception occurred"

    if __name__ == "__main__":
        url = "https://www.taobao.com/"
        print(getHtml(url))
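The bare except in this template swallows every error, including mistakes in your own code. A slightly narrower sketch with the same structure catches only requests' own exceptions:

    import requests

    def getHtml(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()  # raise for 4xx/5xx status codes
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException as e:  # network errors, timeouts, bad status
            return "request failed: %s" % e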
    str = "00000003210Runoob01230000000"
    print(str.strip('0'))   # strip the character 0 from both ends
    str2 = "   Runoob      "
    print(str2.strip())     # strip leading and trailing whitespace

    '''
    3210Runoob0123
    Runoob
    '''
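strip() also has one-sided counterparts; a quick sketch using the same sample string:

    s = "00000003210Runoob01230000000"
    print(s.lstrip('0'))  # 3210Runoob01230000000  (left side only)
    print(s.rstrip('0'))  # 00000003210Runoob0123  (right side only)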
• Original post: https://www.cnblogs.com/tingtin/p/9866309.html