• 爬虫实战


    爬取拉勾网职位信息

    参考博客:https://www.cnblogs.com/wsmrzx/p/10993640.html

    # Ajax endpoint (example with a city filter applied):
    # https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
    import requests
    # The URL that actually serves the job data (JSON Ajax endpoint)
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    # Search form fields: first page of results for the keyword 'python'
    payload = {
        'first': 'true',
        'pn': '1',
        'kd': 'python',
    }

    # Browser-like headers; Referer points at the search page (the endpoint
    # reportedly rejects requests without it — see the blog post referenced above)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Accept': 'application/json, text/javascript, */*; q=0.01'
    }
    # The human-facing search page (the "original" URL, source of the cookies)
    urls ='https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    # Create a session so cookies persist across requests
    s = requests.Session()
    # Visit the search page first to pick up its cookies
    s.get(urls, headers=header, timeout=3)
    # Cookies captured by that visit
    cookie = s.cookies
    # POST the Ajax request with those cookies attached and grab the body text
    response = s.post(url, data=payload, headers=header, cookies=cookie, timeout=5).text
    print(response)

    爬取红楼梦小说

    # Scrape the complete text of "Dream of the Red Chamber" (红楼梦),
    # chapter by chapter, from shicimingju.com into hlm.txt.
    # Index page: http://www.shicimingju.com/book/hongloumeng.html

    import requests

    from bs4 import BeautifulSoup
    ret = requests.get('https://www.shicimingju.com/book/hongloumeng.html')
    # print(ret.text)

    # The table of contents is a <ul> of <li><a> links inside div.book-mulu
    soup = BeautifulSoup(ret.text, 'lxml')
    li_list = soup.find(class_='book-mulu').find('ul').find_all('li')
    with open('hlm.txt', 'w', encoding='utf-8') as f:
        for li in li_list:
            content = li.find('a').text
            # Chapter hrefs are site-relative; prepend the domain
            url = 'https://www.shicimingju.com' + li.find('a').get('href')

            f.write(content)
            # BUG FIX: the original source had a raw line break inside the
            # string literal — f.write(' <newline> ') — which is a
            # SyntaxError; the intended character is '\n'.
            f.write('\n')
            res_content = requests.get(url)
            soup2 = BeautifulSoup(res_content.text, 'lxml')
            content_detail = soup2.find(class_='chapter_content').text
            f.write(content_detail)
            f.write('\n')
            print(content, '写入了')

    爬取肯德基门店

    # Query KFC's store-locator endpoint for shops matching a keyword and
    # print the JSON response.
    # Endpoint: http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
    import requests

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    # Search form: stores matching the keyword, first page of 10 results
    form_data = {
        'cname': '',
        'pid': 20,
        'keyword': '浦东',
        'pageIndex': 1,
        'pageSize': 10,
    }
    response = requests.post(
        'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
        data=form_data,
        headers=request_headers,
    )
    print(response.json())

    爬取糗事百科段子

    # Print every text joke found on page 2 of qiushibaike.com.
    # Page: https://www.qiushibaike.com/text/page/2/
    import requests
    from bs4 import BeautifulSoup

    page = requests.get('https://www.qiushibaike.com/text/page/2/')

    # Each joke lives in an element with class "article"; its text is in
    # a child with class "content".
    dom = BeautifulSoup(page.text, 'html.parser')
    for post in dom.find_all(class_='article'):
        print(post.find(class_='content').text)
        print('-------')

    爬取京东商品信息

    from selenium import webdriver
    import time
    # Keys lets us send special keystrokes (e.g. ENTER) to an input element
    from selenium.webdriver.common.keys import Keys
    # NOTE(review): executable_path is deprecated in Selenium 4 — confirm the
    # installed selenium version before reusing this snippet.
    bro=webdriver.Chrome(executable_path='./chromedriver.exe')
    # Implicit wait: poll up to 10s for elements to appear before raising
    bro.implicitly_wait(10)
    
    def get_goods_info(bro):
        """Print name, price, URL, comment count and image URL for every
        product tile on the current JD search-results page, then click the
        '下一页' (next page) link and recurse onto the following page.

        The recursion ends when the next-page link can no longer be found:
        the resulting selenium exception propagates to the caller, which is
        expected to handle it.
        NOTE(review): very deep result sets could approach Python's
        recursion limit; an explicit loop would be safer.
        """
        # Each product tile in the results grid
        goods = bro.find_elements_by_css_selector('.gl-item')
        # print(len(goods))
        for good in goods:
            try:
                price = good.find_element_by_css_selector('.p-price i').text
                name = good.find_element_by_css_selector('.p-name em').text
                url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
                commits = good.find_element_by_css_selector('.p-commit strong>a').text
                photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')
    
                print('''
                商品名字:%s
                商品价格:%s
                商品地址:%s
                商品评论数:%s
                商品图片地址:%s
        
                ''' % (name, price, url, commits, photo_url))
            except Exception:
                # Some tiles (ads/placeholders) lack one of these nodes; skip
                # them. (Fixed: the exception was bound to an unused name.)
                continue

        # Advance to the next results page; raises if there is no such link
        next_button = bro.find_element_by_partial_link_text('下一页')
        time.sleep(1)
        next_button.click()

        get_goods_info(bro)
    
    try:
        # Open JD's home page, type the search keyword, submit, then scrape
        bro.get('https://www.jd.com/')
    
        input_k=bro.find_element_by_id('key')
    
        input_k.send_keys('奶牛')
        # Simulate pressing the keyboard's ENTER key to submit the search
        input_k.send_keys(Keys.ENTER)
        get_goods_info(bro)
    
    
    except Exception as e:
        print(e)
    
    finally:
        # Always release the browser, even after a scraping failure
        bro.close()

    自动登录12306

    from selenium import webdriver
    import time
    # Pillow, used to crop the captcha out of the full-page screenshot
    from PIL import Image
    
    # Chaojiying: third-party captcha-recognition service client
    
    from chaojiying import Chaojiying_Client
    
    
    from selenium.webdriver import ActionChains
    bro=webdriver.Chrome(executable_path='./chromedriver.exe')
    bro.implicitly_wait(10)
    try:
        bro.get('https://kyfw.12306.cn/otn/resources/login.html')
        bro.maximize_window()  # maximize the window (full screen)
        # Switch to the account-login tab
        button_z=bro.find_element_by_css_selector('.login-hd-account a')
        button_z.click()
        time.sleep(2)
        # Screenshot the entire page
        bro.save_screenshot('./main.png')
        # Locate the captcha image to learn its position and size
        img_t=bro.find_element_by_id('J-loginImg')
        print(img_t.size)
        print(img_t.location)
    
        size=img_t.size
        location=img_t.location
    
        # Bounding box (left, upper, right, lower) of the captcha within the screenshot
        img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
        # Open the full screenshot...
        img = Image.open('./main.png')
        # ...and crop the captcha out of it
        fram = img.crop(img_tu)
        # Save the cropped captcha image
        fram.save('code.png')
    
        # Send the captcha to Chaojiying for recognition
        # NOTE(review): account credentials are hard-coded here — move them
        # to environment variables / config before sharing this script.
        chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')    # user center >> software ID: generate one and substitute it
        im = open('code.png', 'rb').read()                                                    # local image file path; on Windows '//' may be needed
        # print(chaojiying.PostPic(im, 9004))
    
        ## With multiple targets the result looks like '260,133|123,233';
        ## convert it into [[260, 133], [123, 233]]
        res=chaojiying.PostPic(im, 9004)
        print(res)
        result=res['pic_str']
    
        all_list = []
        if '|' in result:
            # Several coordinate pairs, separated by '|'
            list_1 = result.split('|')
            count_1 = len(list_1)
            for i in range(count_1):
                xy_list = []
                x = int(list_1[i].split(',')[0])
                y = int(list_1[i].split(',')[1])
                xy_list.append(x)
                xy_list.append(y)
                all_list.append(xy_list)
        else:
            # A single coordinate pair, e.g. '260,133'
            x = int(result.split(',')[0])
            y = int(result.split(',')[1])
            xy_list = []
            xy_list.append(x)
            xy_list.append(y)
            all_list.append(xy_list)
        print(all_list)
        # Click each recognized point on the captcha using an action chain
        # (offsets are relative to the captcha element), e.g. [[260,133],[123,233]]
        for a in all_list:
            x = a[0]
            y = a[1]
            ActionChains(bro).move_to_element_with_offset(img_t, x, y).click().perform()
            time.sleep(1)
    
        # Fill in username/password and submit the login form
        # NOTE(review): real credentials are hard-coded below as well.
        username=bro.find_element_by_id('J-userName')
        username.send_keys('306334678')
        password=bro.find_element_by_id('J-password')
        password.send_keys('lqz12345')
        time.sleep(3)
        submit_login=bro.find_element_by_id('J-login')
        submit_login.click()
        time.sleep(3)
    
        # Dump the session cookies, then browse to the member home page
        print(bro.get_cookies())
        time.sleep(10)
        bro.get('https://www.12306.cn/index/')
        time.sleep(5)
    
    except Exception as e:
        print(e)
    finally:
        bro.close()
  • 相关阅读:
    java--静态变量
    java--IO
    java--集合
    java--线程
    java——图形用户界面编程-——布局
    java--图形用户界面编程
    String类
    JAVA-继承和接口
    JAVA——构造方法
    JAVA——接口
  • 原文地址:https://www.cnblogs.com/ZhZhang12138/p/14885673.html
Copyright © 2020-2023  润新知