• A simple Google search scraper in Python

The script below fetches Google's HTML results pages with requests, extracts each result's title and link with BeautifulSoup, and keeps paging until a results page comes back empty.

    import requests
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent
    
    
    class GoogleSpider:
        def __init__(self, **kwargs):
            self.keyword = kwargs.get("keyword")
    
        def search(self, **kwargs) -> list:
            data = []
            # Prefer the keyword passed to search(); fall back to the one set in __init__.
            query = kwargs.get("keyword")
            if query is None:
                query = self.keyword
            if query is None:
                return data
            query = query.replace(' ', '+')
            # URL = f"http://google.com/search?q={query}"
            page = 0
            while True:
                # URL = f"https://www.google.com.hk/search?q={query}&newwindow=1&ei=l51XYufsEJX09APssZboDg&start={page * 10}&sa=N&ved=2ahUKEwinlJbD1pL3AhUVOn0KHeyYBe0Q8tMDegQIAhA1&biw=1536&bih=370&dpr=1.25"
                URL="https://www.google.com.hk/search?q={query}&newwindow=1&ei=pbdXYtL9FNW-0PEPv96DiA0&start={page * 10}&sa=N&ved=2ahUKEwiS5Nqv75L3AhVVHzQIHT_vANEQ8tMDegQIARA1&biw=1536&bih=396&dpr=1.25"
                try:
                    print(f"Searching for [{query}], page {page}...")
                    headers = {
                        # fake_useragent needs a local ua.json cache here; a static UA string
                        # for a current Chrome release works as a simpler substitute.
                        'User-Agent': str(UserAgent(path="ua.json").random),
                        # Session cookies copied from a browser (they get past the consent
                        # interstitial); replace them with your own.
                        "cookie": "CONSENT=YES+srp.gws-20211208-0-RC2.zh-CN+FX+870; "
                                  "AEC=AVQQ_LBBv2AdMIJg5Mo-mhbpPvz7Yy6TXL2YDpPEIWPZ2V12AZNvVRj01w; 1P_JAR=2022-04-14-04; "
                                  "NID=511"
                                  "=WG_TSuY8P75PO_IIAjeBJh4D9Z1peKXWPh22PDAN62GWAajB5gIj6tvOQRCjHX5g9PEJPyM2RDB_ZlT5qS3lSXhUpOA1U9KkBkt3UbLM6uoHIZubQoHzZMzstsr_e_8eMDo9LPs18nMvIJf-4C6F_XC6TvZCYmgER4Dt2YzXRu6DhCoDljBI46qarDZiCTFDKvy2PNp_hzrGTfOUqg; DV=I3h3GDVGyQsvcNiZldgA7vxYqO5jAlg4dyRxmh2zaAEAAAA",
                    }
    
                    resp = requests.get(URL, headers=headers, timeout=10)
                    # Dump the raw response so the selectors can be debugged offline.
                    with open("1.html", "wb") as f:
                        f.write(resp.content)
                    if resp.status_code == 200:
                        soup = BeautifulSoup(resp.content, "html.parser")
                        # Each organic result's link and title sit inside a div of class "yuRUbf".
                        li_arr = soup.select("div[class='yuRUbf']")
                        if len(li_arr):
                            for key in li_arr:
                                li_a = key.select("a")
                                a_href = li_a[0].attrs["href"]
                                li_h3 = li_a[0].select("h3")
                                _title = li_h3[0].text.strip().replace("\n", "")
                                obj = {"company": query, "title": _title, "url": a_href}
                                print(obj)
                                data.append(obj)
                            page += 1
                        else:
                            # An empty results page means we have run out of results.
                            break
                    else:
                        # A non-200 response (rate limiting, CAPTCHA) would otherwise retry forever.
                        break
                except Exception as e:
                    print(e)
                    break
            return data
    
    
    if __name__ == "__main__":
        gs = GoogleSpider()
        data = gs.search(keyword="python")
        print(f"Collected {len(data)} results in total")
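
Since each raw response is also dumped to 1.html, the selector logic can be exercised offline against that cached file. A minimal sketch, assuming a 1.html produced by a previous run of the spider above (the div.yuRUbf selector is the same one the spider uses):

    from bs4 import BeautifulSoup

    # Re-parse the page cached by the last run, so selector tweaks
    # don't require hitting Google again.
    with open("1.html", "rb") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # Same selector as the spider: each organic result sits in div.yuRUbf.
    for div in soup.select("div[class='yuRUbf']"):
        a = div.select_one("a")
        h3 = a.select_one("h3") if a else None
        if a and h3:
            print(h3.text.strip(), "->", a.attrs["href"])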
• Original post: https://www.cnblogs.com/lxz123/p/16147973.html