• Web scraper: crawl the capital and area of every country and save them to a txt file


    # -*- coding:utf-8 -*-
    # Python 2 script: crawl the country index page, then each country's detail
    # page, and append one "index name capital" record per line to countrys.txt.

    import urllib2
    import lxml.html
    from lxml import etree

    def main():
        # Truncate the output file; get_capital() appends to it later.
        file = open('./countrys.txt', 'w+')
        file.close()
        countrys = []
        url = 'https://guojiadiqu.51240.com/'
        html = urllib2.urlopen(url).read()
        # Alternative using CSS selectors:
        # tree = lxml.html.fromstring(html)
        # td = tree.cssselect('div#main_content > ul.list > li > a > @href')[0]
        selector = etree.HTML(html)
        uls = selector.xpath('//div[@id="main_content"]/ul')
        for ul in uls:
            lis = ul.xpath('./li')
            for li in lis:
                # Map the country name to the absolute URL of its detail page.
                country_infos = {}
                key = li.xpath('./a/text()')[0]
                value = 'https://guojiadiqu.51240.com' + li.xpath('./a/@href')[0].strip()
                country_infos[key] = value
                countrys.append(country_infos)
        return get_capital(countrys)

    def get_capital(countrys):
        i = 0
        for country in countrys:
            i += 1
            name = country.keys()[0]
            url = country.values()[0]
            html = urllib2.urlopen(url).read()
            tree = etree.HTML(html)
            # The capital sits in the third row of the nested info table.
            table = tree.xpath('//div[@id="main_content"]/table')[0]
            rows = table.xpath('./tr/td/table/tr')
            capital = rows[2].xpath('./td/text()')
            file = open('./countrys.txt', 'a')
            if len(capital) > 0:
                content = str(i) + ' ' + name + ' ' + capital[0] + '\n'
            else:
                content = str(i) + ' ' + name + '\n'
            file.write(content.encode('utf-8'))
            file.close()

    if __name__ == "__main__":
        main()
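
The script above is Python 2 only (urllib2 and unordered dict indexing via keys()[0] do not exist in Python 3). A minimal Python 3 sketch of the same crawl is shown below; it assumes the page structure targeted by the original XPath expressions is unchanged and that the third-party requests library is installed.

    # Rough Python 3 equivalent of the crawl above (assumption: same site layout).
    import requests
    from lxml import etree

    BASE_URL = 'https://guojiadiqu.51240.com'

    def fetch_country_links():
        """Return (name, url) pairs scraped from the index page."""
        html = requests.get(BASE_URL + '/', timeout=10).content
        tree = etree.HTML(html)
        links = []
        for a in tree.xpath('//div[@id="main_content"]/ul/li/a'):
            name = a.xpath('./text()')[0]
            href = a.xpath('./@href')[0].strip()
            links.append((name, BASE_URL + href))
        return links

    def main():
        with open('countrys.txt', 'w', encoding='utf-8') as f:
            for i, (name, url) in enumerate(fetch_country_links(), start=1):
                detail = etree.HTML(requests.get(url, timeout=10).content)
                # Same nested-table path as the original script.
                rows = detail.xpath('//div[@id="main_content"]/table/tr/td/table/tr')
                capital = rows[2].xpath('./td/text()') if len(rows) > 2 else []
                f.write('%d %s %s\n' % (i, name, capital[0] if capital else ''))

    if __name__ == '__main__':
        main()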
  • Original post: https://www.cnblogs.com/wozuilang-mdzz/p/9737265.html