• Day92


    # session: keeps state (such as cookies) from the client's previous requests
    # BeautifulSoup is a module that takes an HTML or XML string and parses it into a tree;
    # you can then use the methods it provides to quickly locate a given element, which
    # makes searching HTML or XML documents simple.
    # Parser choices:
    #   Python stdlib    - BeautifulSoup(markup, "html.parser") - tolerant of bad markup, built in, moderate speed; less tolerant on old Python versions
    #   lxml HTML parser - BeautifulSoup(markup, "lxml")        - tolerant of bad markup, fast; requires the lxml C library
    # pip install lxml
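    # All the snippets below assume a `soup` object built in advance. A minimal setup
    # sketch (the sample HTML here is an assumption, modeled on the names the selectors
    # further down refer to: link1, sister, example.com):
    from bs4 import BeautifulSoup

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <div id="d1">
    <p class="story">Once upon a time there were three sisters:
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>.
    </p>
    </div>
    <span>some text</span>
    <br/>
    </body></html>
    """
    soup = BeautifulSoup(html, 'lxml')  # fall back to 'html.parser' if lxml is missing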
    # Study the official docs; to look up a tag's methods or classes later, read the
    # source and trace the class, its parent class, and its base class.
    # 1. name: the tag's name
    # tag = soup.find('a')
    # name = tag.name  # get
    # print(name)
    # tag.name = 'span'  # set
    # print(soup)
    # 2. attrs: the tag's attributes (returned as a dict)
    # tag = soup.find('a')  # find the first <a> tag
    # attrs = tag.attrs  # get all attributes
    # print(attrs)
    # tag.attrs = {'ik': 123}  # replace all attributes
    # tag.attrs['id'] = 'iiiii'  # set a single attribute
    # print(soup)
    # 3. children: all direct child nodes
    # body = soup.find('body')
    # v = body.children
    # Each object yielded by children is either a NavigableString (plain text) or a Tag;
    # check the type to keep only the tags you want (see the sketch after item 4).
    # 4. descendants: all descendant nodes (children, grandchildren, and so on)
    # body = soup.find('body')
    # v = body.descendants
    # Depth-first: it follows one branch all the way to the bottom before starting the
    # next, visiting every descendant.
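    # A short sketch of the type check described above (assumes the setup `soup`):
    # from bs4.element import Tag
    # body = soup.find('body')
    # child_tags = [c for c in body.children if isinstance(c, Tag)]          # direct children only
    # descendant_tags = [d for d in body.descendants if isinstance(d, Tag)]  # all levels
    # print(child_tags)
    # print(descendant_tags)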
    # 5. clear: remove everything inside the tag (the tag itself is kept)
    # tag = soup.find('body')
    # tag.clear()
    # print(soup)
    # 6. decompose: recursively destroy the tag and everything inside it (the tag itself is removed)
    # body = soup.find('body')
    # body.decompose()
    # print(soup)
    # 7. extract: recursively remove the tag and return it (similar to pop)
    # body = soup.find('body')
    # v = body.extract()
    # print(soup)
    # 8. decode: serialize to a string (including the current tag); decode_contents (excluding the current tag)
    # body = soup.find('body')
    # v = body.decode()
    # v = body.decode_contents()
    # print(v)
    # 9. encode: serialize to bytes (including the current tag); encode_contents (excluding the current tag)
    # body = soup.find('body')
    # v = body.encode()
    # v = body.encode_contents()
    # print(v)
    # 10. find: get the first matching tag
    # tag = soup.find('a')
    # print(tag)
    # tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    # tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    # print(tag)
    # 11. find_all: get all matching tags
    # tags = soup.find_all('a')
    # print(tags)

    # tags = soup.find_all('a',limit=1)
    # print(tags)

    # tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    # # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    # print(tags)


    # ####### Lists #######
    # v = soup.find_all(name=['a','div'])
    # print(v)

    # v = soup.find_all(class_=['sister0', 'sister'])
    # print(v)

    # v = soup.find_all(text=['Tillie'])
    # print(v, type(v[0]))


    # v = soup.find_all(id=['link1','link2'])
    # print(v)

    # v = soup.find_all(href=['link1','link2'])
    # print(v)

    # ####### Regular expressions #######
    # import re
    # rep = re.compile('p')
    # rep = re.compile('^p')
    # v = soup.find_all(name=rep)
    # print(v)

    # rep = re.compile('sister.*')
    # v = soup.find_all(class_=rep)
    # print(v)

    # rep = re.compile('http://www.oldboy.com/static/.*')
    # v = soup.find_all(href=rep)
    # print(v)

    # ####### Filtering with a function #######
    # def func(tag):
    #     return tag.has_attr('class') and tag.has_attr('id')
    # v = soup.find_all(name=func)
    # print(v)


    # ## get: read a single tag attribute
    # tag = soup.find('a')
    # v = tag.get('id')
    # print(v)
    # 12. has_attr: check whether the tag has the given attribute
    # tag = soup.find('a')
    # v = tag.has_attr('id')
    # print(v)
    # 13. get_text: get the text inside the tag
    # tag = soup.find('a')
    # v = tag.get_text()  # an optional first argument is used as a separator between strings
    # print(v)
    # 14. index: the index position of a tag within another tag
    # tag = soup.find('body')
    # v = tag.index(tag.find('div'))
    # print(v)

    # tag = soup.find('body')
    # for i, v in enumerate(tag):
    #     print(i, v)
    # 15. is_empty_element: whether the tag is an empty/self-closing element,
    # i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
    # tag = soup.find('br')
    # v = tag.is_empty_element
    # print(v)
    # 16. related tags of the current tag (a short sketch follows this list)
    # soup.next
    # soup.next_element
    # soup.next_elements
    # soup.next_sibling
    # soup.next_siblings
    #
    # tag.previous
    # tag.previous_element
    # tag.previous_elements
    # tag.previous_sibling
    # tag.previous_siblings
    #
    # tag.parent
    # tag.parents
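    # A small sketch of the difference (assumes the setup `soup`): the *_sibling
    # attributes stay on the same level, the *_element ones walk parse order.
    # tag = soup.find('a')
    # print(tag.next_sibling)  # the node right after <a> on the same level (often text)
    # print(tag.next_element)  # the next node in parse order (here <a>'s own text, 'Elsie')
    # print(tag.parent.name)   # the enclosing tag's name ('p' in the sample HTML)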
    # 17. searching among a tag's related tags
    # tag.find_next(...)
    # tag.find_all_next(...)
    # tag.find_next_sibling(...)
    # tag.find_next_siblings(...)
    # tag.find_previous(...)
    # tag.find_all_previous(...)
    # tag.find_previous_sibling(...)
    # tag.find_previous_siblings(...)
    # tag.find_parent(...)
    # tag.find_parents(...)
    # These methods take the same arguments as find_all; a short example follows
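    # For example (assumes the setup `soup`):
    # tag = soup.find('a')
    # print(tag.find_next_sibling('a'))          # the next <a> on the same level
    # print(tag.find_parent('p'))                # the nearest enclosing <p>
    # print(tag.find_all_next(class_='sister'))  # every later tag with class "sister"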
    # 18. select, select_one: CSS selectors
    # soup.select("title")
    #
    # soup.select("p nth-of-type(3)")
    #
    # soup.select("body a")
    #
    # soup.select("html head title")
    #
    # tag = soup.select("span,a")
    #
    # soup.select("head > title")
    #
    # soup.select("p > a")
    #
    # soup.select("p > a:nth-of-type(2)")
    #
    # soup.select("p > #link1")
    #
    # soup.select("body > a")
    #
    # soup.select("#link1 ~ .sister")
    #
    # soup.select("#link1 + .sister")
    #
    # soup.select(".sister")
    #
    # soup.select("[class~=sister]")
    #
    # soup.select("#link1")
    #
    # soup.select("a#link2")
    #
    # soup.select('a[href]')
    #
    # soup.select('a[href="http://example.com/elsie"]')
    #
    # soup.select('a[href^="http://example.com/"]')
    #
    # soup.select('a[href$="tillie"]')
    #
    # soup.select('a[href*=".com/el"]')
    #
    # from bs4.element import Tag
    #
    #
    # NOTE: _candidate_generator is a private parameter of older bs4 versions
    # def default_candidate_generator(tag):
    #     for child in tag.descendants:
    #         if not isinstance(child, Tag):
    #             continue
    #         if not child.has_attr('href'):
    #             continue
    #         yield child
    #
    #
    # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
    # print(type(tags), tags)
    #
    #
    # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
    # print(type(tags), tags)
    # 19. the tag's text content
    # tag = soup.find('span')
    # print(tag.string)  # get
    # tag.string = 'new content'  # set
    # print(soup)

    # tag = soup.find('body')
    # print(tag.string)  # None when the tag holds more than one child node
    # tag.string = 'xxx'
    # print(soup)

    # tag = soup.find('body')
    # v = tag.stripped_strings  # recursively yields the text of every inner tag, whitespace-stripped
    # print(list(v))  # v is a generator
    # 20. append: append a tag at the end of the current tag's contents
    # tag = soup.find('body')
    # tag.append(soup.find('a'))
    # print(soup)
    #
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('body')
    # tag.append(obj)
    # print(soup)
    # 21. insert: insert a tag at the given index inside the current tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('body')
    # tag.insert(2, obj)
    # print(soup)
    # 22. insert_after, insert_before: insert after or before the current tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('body')
    # # tag.insert_before(obj)
    # tag.insert_after(obj)
    # print(soup)
    # 23. replace_with: replace the current tag with the given tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = 'I am new here'
    # tag = soup.find('div')
    # tag.replace_with(obj)
    # print(soup)
    # 24. set up relationships between tags
    # (setup only rewires navigation pointers; it does not move anything in the document)
    # tag = soup.find('div')
    # a = soup.find('a')
    # tag.setup(previous_sibling=a)
    # print(tag.previous_sibling)
    # 25. wrap: wrap the current tag inside the given tag
    # from bs4.element import Tag
    # obj1 = Tag(name='div', attrs={'id': 'it'})
    # obj1.string = 'I am new here'
    #
    # tag = soup.find('a')
    # v = tag.wrap(obj1)
    # print(soup)

    # tag = soup.find('a')
    # v = tag.wrap(soup.find('p'))
    # print(soup)
    # 26. unwrap: remove the current tag but keep its contents (the inverse of wrap)
    # tag = soup.find('a')
    # v = tag.unwrap()
    # print(soup)
  • Original post: https://www.cnblogs.com/pandaboy1123/p/9169338.html