• Simple web scraper operations


    Today I learned some simple Python web-scraping operations.

    1. Creating a directory and writing a file:

import os

def mkdir(path):
    if os.path.exists(path):  # check first whether the directory exists; creating it again would raise an error
        print("The path already exists.")
    else:
        os.makedirs(path)  # create the directory
        print("Done.")

def write(path, text):
    with open(path, "w+") as file:  # write the string to the file
        file.write(text)

def main():
    mkdir("test")
    write("test/test.txt", "hello world")

if __name__ == "__main__":
    main()
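    As a side note, on Python 3.2+ os.makedirs can skip the existence check itself; a minimal sketch of the same helper using the exist_ok flag:

import os

def mkdir(path):
    # exist_ok=True makes makedirs a no-op when the directory already exists
    os.makedirs(path, exist_ok=True)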

    2. Fetching a site's HTML source (if it is reachable):

from bs4 import BeautifulSoup
import requests

def main():
    html = requests.get("https://www.baidu.com")  # request this URL
    html.encoding = "utf-8"  # a Chinese-language page, so set the character set explicitly
    soup = BeautifulSoup(html.text, "lxml")  # BeautifulSoup parses the source (no regular expressions needed)
    print(soup.prettify())  # pretty-print the source (formatting only; nothing is removed)

if __name__ == "__main__":
    main()
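    Note that requests does not raise an error for a 404 page by itself, so it can help to check the response before parsing; a minimal sketch with the same URL:

import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.baidu.com", timeout=10)
html.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx status
html.encoding = "utf-8"
soup = BeautifulSoup(html.text, "lxml")
print(soup.title)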

    3. Extracting elements for particular tags from a page's source (if it is reachable):

import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    print(soup.title)  # tags of all kinds are available as attributes
    print("=" * 100)
    print(type(soup.title))
    print("=" * 100)
    print(type(soup.title.string))
    print("=" * 100)
    print(soup.title.string)
    print("=" * 100)
    print(soup.head)
    print("=" * 100)
    print(soup.p)

def main():
    blog_url = "https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__ == "__main__":
    main()
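    soup.title and soup.p only return the first matching tag; find_all returns them all. A minimal sketch that lists every link on the same blog page (assuming it is reachable):

import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.cnblogs.com/sgh1023")
soup = BeautifulSoup(html.text, "lxml")
for a in soup.find_all("a", {"href": True}):  # every <a> tag that carries an href attribute
    print(a["href"])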

    4. Downloading a single image (if it is reachable):

import requests
import os

tot = 0
path = "save"

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:  # the with block closes the file automatically
        file.write(content)
        tot = tot + 1

def download_image(url):  # download the image; success is not guaranteed
    print("Now downloading...", tot)
    response = requests.get(url)
    save(response.content)
    print("Done!")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__ == "__main__":
    main()
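    For large images it can be better not to hold the whole body in memory; a minimal sketch of the same download streamed in chunks (the function name, filename, and chunk size are arbitrary choices):

import requests

def download_image_streamed(url, filename):
    with requests.get(url, stream=True, timeout=10) as response:  # stream=True defers reading the body
        response.raise_for_status()
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # read the body piece by piece
                file.write(chunk)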

    5. Downloading every image on a page:

import requests
import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup

tot = 0
path = "save"

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:
        file.write(content)
        tot = tot + 1
######################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to pose as a Google Chrome browser (this part is borrowed code)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        requests.get(url)
        return True
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  # check whether the URL is reachable
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")
######################################################################
def process(base, src):  # simple URL handling: resolve a possibly relative src against the page URL
    # urljoin covers absolute, protocol-relative (//host/...) and site-relative (/path) URLs
    return urllib.parse.urljoin(base, src)

def get_image(url):
    soup = BeautifulSoup(get_html_content(url), "lxml")
    items = soup.find_all("img", {"src": True})
    for i in items:
        download_image(process(url, i["src"]))

def main():
    url = "https://www.bilibili.com"
    get_image(url)

if __name__ == "__main__":
    main()
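    The urljoin call used in process above covers the cases the hand-rolled prefixing was meant to handle; a few examples of what it returns:

from urllib.parse import urljoin

base = "https://www.bilibili.com"
print(urljoin(base, "https://example.com/a.png"))  # absolute URL: returned unchanged
print(urljoin(base, "//i0.hdslb.com/a.png"))       # protocol-relative: scheme is added
print(urljoin(base, "/img/a.png"))                 # site-relative: host is added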

    Of course, the arguments to find_all depend on the specific page; two common variants are sketched below.
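    A minimal sketch of filtering by class and by a pattern on src (the HTML snippet and class name are made up for illustration):

import re
from bs4 import BeautifulSoup

html = '<img class="cover" src="/a.jpg"><img src="/b.png">'
soup = BeautifulSoup(html, "lxml")
print(soup.find_all("img", {"class": "cover"}))               # match by attribute value
print(soup.find_all("img", {"src": re.compile(r"[.]png$")}))  # match by regex on the src attribute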

    6. Downloading .jpg files with a regular expression:

import requests
import urllib.request
import os
import re

tot = 0
path = "save"

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".jpg", "wb+") as file:
        file.write(content)
        tot = tot + 1
################################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to pose as a Google Chrome browser (this part is borrowed code)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8", errors="ignore")  # decode the raw bytes; undecodable characters are dropped
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        requests.get(url)
        return True
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")
################################################################################
def process(s):  # the regex below strips the "https://" prefix, so put it back
    if len(s) <= 5:
        return "NO"  # too short to be a real URL
    return "https://" + s

def get_image(url):
    html = get_html_content(url)
    items = re.findall('https://(.*?)([.]jpg)', html, re.S)  # regular expression for .jpg URLs
    for i in items:
        download_image(process(i[0] + i[1]))
        if tot == 100:
            print("Too many images! Stop what you are doing!")
            break

def main():
    url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D1%BC%D7%D3"
    get_image(url)

if __name__ == "__main__":
    main()
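    The pattern above captures each URL in two pieces and discards the scheme, which is why process has to add it back; with no capture groups, findall returns the whole match instead. A minimal sketch (the sample string is made up):

import re

sample = "https://img.example.com/a.jpg and https://img.example.com/b.jpg"
for m in re.findall(r'https://.*?[.]jpg', sample):  # without groups, findall yields the full match
    print(m)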