• Web scraping with bs4


    import requests
    # res = requests.get('http://httpbin.org/get')
    # res1 = res.json()

    # Converting the body to JSON by hand:
    # import json
    # res1 = json.loads(res.text)  # clunky; res.json() does this in one call
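    # Quick verification sketch against httpbin, which echoes the request
    # back as JSON:
    # res = requests.get('http://httpbin.org/get')
    # print(res.json()['url'])  # http://httpbin.org/get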

    # SSL: the certificate-based security layer, i.e. the "s" after "http" in https.

    # response = requests.get('https://www.12306.cn',
    #                         cert=('/path/server.crt',
    #                               '/path/key'))  # client-side certificate and key
    # print(response.status_code)
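    # The related knob is verify=. 12306 historically signed its certificate
    # with its own CA, so requests raises an SSLError unless verification is
    # relaxed; a minimal sketch:
    # import urllib3
    # urllib3.disable_warnings()  # silence the InsecureRequestWarning this causes
    # response = requests.get('https://www.12306.cn', verify=False)
    # print(response.status_code)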

    # Forward proxy
    # Your request reaches the target address through someone else's server.
    # Proxy IPs are usually paid; to verify one works, hit a service you
    # control through it and check the client IP on the server side.
    # proxies = {
    #     # 'http': 'http://egon:123@localhost:9743',  # proxy with credentials: user:password before the @
    #     # 'http': 'http://localhost:9743',
    #     'https': 'https://localhost:9743',
    #     'http': 'http://124.205.155.148:9090',
    # }
    # response = requests.get('https://www.12306.cn',
    #                         proxies=proxies)
    #
    # print(response.status_code)
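    # Sketch of the IP check described above; the proxy address is the same
    # placeholder and must be swapped for a live one. httpbin.org/ip echoes
    # the IP it sees, so it should report the proxy's IP rather than yours.
    # proxies = {'http': 'http://124.205.155.148:9090'}
    # response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    # print(response.json())  # e.g. {'origin': '124.205.155.148'}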
    # Timeout setting
    # import requests
    # response = requests.get('https://www.baidu.com',
    #                         timeout=0.0001)  # deliberately tiny, so it raises a timeout error
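    # Sketch: catch the timeout instead of letting it propagate; a timeout
    # this small is essentially guaranteed to trip.
    # try:
    #     requests.get('https://www.baidu.com', timeout=0.0001)
    # except requests.exceptions.Timeout as e:
    #     print('request timed out:', e)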

    # Uploading a file (assumes a.jpg exists in the working directory)
    import requests
    files = {'file': open('a.jpg', 'rb')}
    response = requests.post('http://httpbin.org/post', files=files)
    print(response.status_code)
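    # httpbin echoes the upload back under 'files' in its JSON body, so a
    # quick sanity check looks like this:
    # print(response.json()['files'].keys())  # dict_keys(['file'])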

    from bs4 import BeautifulSoup
    # BeautifulSoup turns an HTML page into a searchable object.
    '''
    The two core methods are find and find_all.
    find:
    -name="tag name"      match by tag name
    -id=..., class_=...   match by attribute and pull that tag out
    -tag.text             get the tag's text content
    -tag.get(attr_name)   get the value of one of the tag's attributes
    find_all: takes the same filters, but returns every match in a list
    '''
    url = 'https://www.autohome.com.cn/news/1/#liststart'
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    div = soup.find(id="auto-channel-lazyload-article")
    ul = div.find(name='ul')  # only finds the first ul tag
    # ul_list = div.find_all(class_="article")  # find every tag below div with class "article"
    # print(len(ul_list))
    li_list = ul.find_all(name='li')
    # print(len(li_list))
    for li in li_list:
        h3 = li.find(name='h3')
        if h3:
            title = h3.text  # pull the text out of the h3 tag
            print(title)
        a = li.find(name='a')
        if a:
            article_url = a.get('href')  # pull the href attribute out of the a tag
            print(article_url)

        img = li.find(name='img')
        if img:
            img_url = img.get('src')
            print(img_url)
        p = li.find(name='p')
        if p:
            content = p.text
            print(content)

    # Anything find returns can itself be searched with find again.
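    # A sketch of that chaining against the page above: each find narrows the
    # search scope to the tag it just returned.
    # first_li = soup.find(id='auto-channel-lazyload-article').find('ul').find('li')
    # h3 = first_li.find('h3')
    # print(h3.text if h3 else 'this li has no h3')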


    from bs4 import BeautifulSoup
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>

    <p class="title" id="bbaa"><b name="xx" age="18">The Dormouse's story</b><b>xxxx</b></p>
    <p class="xxx" a="xxx">asdfasdf</p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
    """

    soup = BeautifulSoup(html_doc, 'lxml')
    # ress = soup.prettify()  # pretty-print the markup
    # soup = BeautifulSoup(ress, 'lxml')
    # print(ress)

    # Traversing the document tree
    # print(soup.p.name)
    # print(soup.p.attrs)
    # print(soup.p.string)
    # print(list(soup.p.strings))
    # print(soup.p.text)

    # print(soup.body.p.text)
    # print(soup.body.p.contents)
    # print(list(soup.body.p.children))
    # print(list(soup.body.p.descendants))
    # print(soup.body.p.parent)
    # print(list(soup.body.p.parents))
    # print(len(list(soup.body.p.parents)))
    # print(soup.body.p.previous_sibling)
    # print(soup.find(class_="xxx").previous_sibling)
    # print(soup.a.next_sibling)
    # print(soup.a.previous_sibling)
    # print(type(soup.p))
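    # Sketch contrasting the three text accessors on the first <p> above:
    # .string is None when a tag has more than one child, .strings yields
    # each text fragment, and .text joins them all.
    # p = soup.body.p
    # print(p.string)         # None: this p holds two <b> children
    # print(list(p.strings))  # ["The Dormouse's story", 'xxxx']
    # print(p.text)           # The Dormouse's storyxxxx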



    # Searching the document tree
    # Five kinds of filters: string, regular expression, boolean, function, list
    import re
    # print(soup.find_all(name='b'))


    # print(soup.find_all(name=re.compile('^b')))
    # print(soup.find_all(id=re.compile('^b')))


    # print(soup.find_all(name=['a','b']))
    # print(soup.find_all(name=True))

    # def has_class_but_no_id(tag):
    #     return tag.has_attr('class') and not tag.has_attr('id')
    # print(soup.find_all(name=has_class_but_no_id))
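    # The same filters work on attributes too; attrs= is handy when the
    # attribute name clashes with Python syntax. A sketch against html_doc:
    # print(soup.find_all(attrs={'class': 'sister'}))   # all three <a> tags
    # print(soup.find_all(href=re.compile('tillie')))   # regex match on an attribute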

    # CSS selectors (xpath is the other common query language)
    # print(soup.select(".title"))
    # print(soup.select("#bbaa"))

    # print(soup.select('#bbaa b')[0].attrs.get('name'))

    # recursive=False: search only direct children, not the whole subtree
    # limit: stop after this many matches
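    # Sketch of both options against html_doc: <title> lives under <head>,
    # so a recursive=False search from <html> comes back empty.
    # print(soup.find_all('a', limit=2))                   # stops after 2 of the 3 links
    # print(soup.html.find_all('title', recursive=False))  # []: title is not a direct child
    # print(soup.head.find_all('title', recursive=False))  # [<title>The Dormouse's story</title>]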

    sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'lxml')
    print(sibling_soup.b.next_sibling)      # <c>text2</c>
    print(sibling_soup.c.previous_sibling)  # <b>text1</b>



    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    import time

    # from selenium.webdriver.chrome.options import Options
    # chrome_options = Options()
    # chrome_options.add_argument('window-size=1920x3000')  # set the browser resolution
    # chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this to work around a bug
    # chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
    # chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images, for speed
    # chrome_options.add_argument('--headless')  # no visible window; required on Linux boxes without a display
    # chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # manually point at the Chrome binary
    # bro = webdriver.PhantomJS()

    # bro = webdriver.Chrome(chrome_options=chrome_options)
    bro = webdriver.Chrome()
    bro.get('https://www.baidu.com')

    # print(bro.page_source)
    # time.sleep(3)
    time.sleep(1)
    # Grab the input box
    inp = bro.find_element_by_id('kw')
    # Type into it
    inp.send_keys("美女")
    inp.send_keys(Keys.ENTER)  # press Enter
    # Alternatively: grab the search button (id "su") and click it
    time.sleep(3)
    bro.close()
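    # Sketch of a sturdier version of the flow above: explicit waits instead
    # of fixed time.sleep calls, using Selenium's stock wait helpers.
    # from selenium.webdriver.common.by import By
    # from selenium.webdriver.support.ui import WebDriverWait
    # from selenium.webdriver.support import expected_conditions as EC
    # bro = webdriver.Chrome()
    # bro.get('https://www.baidu.com')
    # inp = WebDriverWait(bro, 10).until(
    #     EC.presence_of_element_located((By.ID, 'kw')))
    # inp.send_keys('美女', Keys.ENTER)
    # WebDriverWait(bro, 10).until(EC.title_contains('美女'))
    # bro.close()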