• Simple web-scraper operations


    Today I learned some simple Python web-scraping operations.

    1. Learning to create a folder and create a file:

    import os

    def mkdir(path):
        if os.path.exists(path):  # check whether the folder exists first, or makedirs may raise an error
            print("The path already exists.")
        else:
            os.makedirs(path)  # create the folder
            print("Done.")

    def write(path, text):
        with open(path, "w+") as file:  # write the file
            file.write(text)

    def main():
        mkdir("test")
        write("test/test.txt", "hello world")

    if __name__ == "__main__":
        main()
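
    Incidentally, the standard library can fold the existence check into a single call; here is a minimal sketch of the same create-and-write flow using pathlib (the folder and file names are placeholders of my own):

    from pathlib import Path

    folder = Path("test")
    folder.mkdir(parents=True, exist_ok=True)  # no error if the folder already exists
    (folder / "test.txt").write_text("hello world", encoding="utf-8")  # create or overwrite the file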

    2. Getting a site's source code (if it is reachable):

    from bs4 import BeautifulSoup
    import requests

    def main():
        html = requests.get("https://www.baidu.com")  # fetch this URL
        html.encoding = "utf-8"  # Chinese site, so switch the character set
        soup = BeautifulSoup(html.text, "lxml")  # BeautifulSoup parses the HTML for us, no regular expressions needed
        print(soup.prettify())  # pretty-print the source (formatting only, no data is removed)

    if __name__ == "__main__":
        main()
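
    A request can succeed at the network level yet still return an error page, so it is worth checking the status before parsing; a minimal sketch (the timeout value is an arbitrary choice of mine):

    import requests
    from bs4 import BeautifulSoup

    response = requests.get("https://www.baidu.com", timeout=10)
    response.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
    response.encoding = response.apparent_encoding  # let requests guess the charset from the body
    soup = BeautifulSoup(response.text, "lxml")
    print(soup.title.string if soup.title else "no <title> found")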

    3. Getting the elements of specific tags from a site's source (if it is reachable):

    import requests
    from bs4 import BeautifulSoup

    def write_to_file(content):  # helper for appending text to a file (not called below)
        with open("save.txt", "a", encoding="utf-8") as f:
            f.write(content)

    def get_blog_info(url):
        html = requests.get(url)
        soup = BeautifulSoup(html.text, "lxml")
        print(soup.title)  # all sorts of elements are reachable as attributes
        print("=" * 100)
        print(type(soup.title))  # a bs4 Tag
        print("=" * 100)
        print(type(soup.title.string))  # a NavigableString
        print("=" * 100)
        print(soup.title.string)
        print("=" * 100)
        print(soup.head)
        print("=" * 100)
        print(soup.p)  # only the first <p> tag

    def main():
        blog_url = "https://www.cnblogs.com/sgh1023"
        get_blog_info(blog_url)

    if __name__ == "__main__":
        main()
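
    soup.p only gives the first match; to walk every matching tag, find_all (or a CSS selector) does the job. A minimal sketch that lists every link on the same blog page (printing the href attributes is my own choice of example):

    import requests
    from bs4 import BeautifulSoup

    html = requests.get("https://www.cnblogs.com/sgh1023")
    soup = BeautifulSoup(html.text, "lxml")
    for a in soup.find_all("a", href=True):  # every <a> tag that has an href attribute
        print(a["href"], a.get_text(strip=True))
    # the CSS-selector equivalent is soup.select("a[href]")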

    4. Downloading a single image (if it is reachable):

    import os
    import requests

    tot = 0
    path = "save"

    def mkdir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    def save(content):
        global tot
        mkdir(path)
        with open(path + "/" + str(tot) + ".png", "wb+") as file:  # the with-block closes the file for us
            file.write(content)
        tot = tot + 1

    def download_image(url):  # download the image; success is not guaranteed
        print("Now downloading...", tot)
        response = requests.get(url)
        save(response.content)
        print("Done !")

    def main():
        download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

    if __name__ == "__main__":
        main()
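
    For larger files it is better to stream the download instead of holding the whole body in memory; a minimal sketch using requests' stream mode (the chunk size is arbitrary, and the output path assumes the save folder already exists):

    import requests

    url = "https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png"
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        with open("save/streamed.png", "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # write 8 KiB at a time
                file.write(chunk)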

    5. Downloading the images on a web page:

    import os
    import urllib.request
    import requests
    from bs4 import BeautifulSoup

    tot = 0
    path = "save"

    def mkdir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    def save(content):
        global tot
        mkdir(path)
        with open(path + "/" + str(tot) + ".png", "wb+") as file:
            file.write(content)
        tot = tot + 1
    ######################################################################
    def get_html_content(url):  # fetch the page source
        req = urllib.request.Request(url)  # add a header to masquerade as a Chrome browser (the user-agent string is copied)
        req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read()
        return html

    def url_exist(url):  # check whether the URL can actually be fetched
        try:
            requests.get(url)
            return True
        except requests.RequestException:
            return False

    def download_image(url):  # download one image
        print("Now downloading...", tot, url)
        if url_exist(url):
            response = requests.get(url)
            save(response.content)
            print("Done !")
        else:
            print("Unavailable !")
    ######################################################################
    def process(src):  # naive URL fix-up for the src attribute
        if src[0] == 'h':  # already absolute, e.g. https://...
            return src
        elif src[0] == '/' and src[1] != '/':  # site-relative path; really needs the page URL (see the sketch below)
            return "https:/" + src
        return "https:" + src  # protocol-relative, e.g. //host/a.png

    def get_image(url):
        soup = BeautifulSoup(get_html_content(url), "lxml")
        items = soup.find_all("img", {"src": True})  # every <img> tag that has a src attribute
        for i in items:
            download_image(process(i["src"]))

    def main():
        url = "https://www.bilibili.com"
        get_image(url)

    if __name__ == "__main__":
        main()

     Of course, the arguments to find_all depend on the page at hand. Note that the process helper above is crude: a site-relative src such as /img/a.png should really be joined against the page's own URL, as in the sketch below.
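
    A minimal sketch of that joining step with the standard library's urljoin (my own addition; the example paths and hosts are made up):

    from urllib.parse import urljoin

    page_url = "https://www.bilibili.com"
    print(urljoin(page_url, "/img/a.png"))                      # https://www.bilibili.com/img/a.png
    print(urljoin(page_url, "//i0.example.com/a.png"))          # https://i0.example.com/a.png
    print(urljoin(page_url, "https://cdn.example.com/a.png"))   # absolute URLs pass through unchanged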

    6. Downloading .jpg files with a regular expression:

    import os
    import re
    import urllib.request
    import requests

    tot = 0
    path = "save"

    def mkdir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    def save(content):
        global tot
        mkdir(path)
        with open(path + "/" + str(tot) + ".jpg", "wb+") as file:
            file.write(content)
        tot = tot + 1
    ################################################################################
    def get_html_content(url):  # fetch the page source
        req = urllib.request.Request(url)  # add a header to masquerade as a Chrome browser (the user-agent string is copied)
        req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8", errors="ignore")  # decode the bytes instead of str()-ing them
        return html

    def url_exist(url):  # check whether the URL can actually be fetched
        try:
            requests.get(url)
            return True
        except requests.RequestException:
            return False

    def download_image(url):  # download one image
        print("Now downloading...", tot, url)
        if url_exist(url):
            response = requests.get(url)
            save(response.content)
            print("Done !")
        else:
            print("Unavailable !")
    ################################################################################
    def process(match):  # rebuild a full URL; the regex below strips the https:// prefix
        if len(match) <= 5:  # too short to be a real image URL, let it fail as unavailable
            return "NO"
        return "https://" + match

    def get_image(url):
        S = get_html_content(url)
        items = re.findall('https://(.*?)[.]jpg', S, re.S)  # lazily capture everything up to the next ".jpg"
        for i in items:
            download_image(process(i + ".jpg"))
            if tot == 100:
                print("Too many images ! Stop what you are doing !")
                break

    def main():
        url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D1%BC%D7%D3"
        get_image(url)

    if __name__ == "__main__":
        main()
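
    The lazy (.*?) capture is fragile on minified pages, where it can span quotes and whitespace; a tighter character class plus de-duplication is usually safer. A minimal sketch on a made-up snippet of HTML:

    import re

    html = '<img src="https://a.example.com/1.jpg"><img src="https://a.example.com/1.jpg">'
    urls = set(re.findall(r'https://[^\s"\'<>]+?\.jpg', html))  # quotes and whitespace cannot appear inside a match
    for u in sorted(urls):
        print(u)  # each unique .jpg URL once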