• Web scraper: crawl the capital and area of every country and save them to a txt file


    # -*- coding:utf-8 -*-
    # Python 2 script: crawl the country index page, then each country's detail
    # page, and append one "index name capital" record per line to countrys.txt.

    import urllib2
    import lxml.html
    from lxml import etree

    def main():
        # Truncate the output file; get_capital() appends to it later.
        file = open('./countrys.txt', 'w+')
        file.close()
        countrys = []
        url = 'https://guojiadiqu.51240.com/'
        html = urllib2.urlopen(url).read()
        # Alternative using CSS selectors:
        # tree = lxml.html.fromstring(html)
        # td = tree.cssselect('div#main_content > ul.list > li > a > @href')[0]
        selector = etree.HTML(html)
        uls = selector.xpath('//div[@id="main_content"]/ul')
        for ul in uls:
            lis = ul.xpath('./li')
            for li in lis:
                # Map the country name to the absolute URL of its detail page.
                country_infos = {}
                key = li.xpath('./a/text()')[0]
                value = 'https://guojiadiqu.51240.com' + li.xpath('./a/@href')[0].strip()
                country_infos[key] = value
                countrys.append(country_infos)
        return get_capital(countrys)

    def get_capital(countrys):
        i = 0
        for country in countrys:
            i += 1
            name = country.keys()[0]
            url = country.values()[0]
            html = urllib2.urlopen(url).read()
            tree = etree.HTML(html)
            # The capital sits in the third row of the nested info table.
            table = tree.xpath('//div[@id="main_content"]/table')[0]
            rows = table.xpath('./tr/td/table/tr')
            capital = rows[2].xpath('./td/text()')
            file = open('./countrys.txt', 'a')
            if len(capital) > 0:
                content = str(i) + ' ' + name + ' ' + capital[0] + '\n'
            else:
                content = str(i) + ' ' + name + '\n'
            file.write(content.encode('utf-8'))
            file.close()

    if __name__ == "__main__":
        main()
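
The script above is Python 2 only (urllib2 and unordered dict indexing via keys()[0] do not exist in Python 3). A minimal Python 3 sketch of the same crawl is shown below; it assumes the page structure targeted by the original XPath expressions is unchanged and that the third-party requests library is installed.

    # Rough Python 3 equivalent of the crawl above (assumption: same site layout).
    import requests
    from lxml import etree

    BASE_URL = 'https://guojiadiqu.51240.com'

    def fetch_country_links():
        """Return (name, url) pairs scraped from the index page."""
        html = requests.get(BASE_URL + '/', timeout=10).content
        tree = etree.HTML(html)
        links = []
        for a in tree.xpath('//div[@id="main_content"]/ul/li/a'):
            name = a.xpath('./text()')[0]
            href = a.xpath('./@href')[0].strip()
            links.append((name, BASE_URL + href))
        return links

    def main():
        with open('countrys.txt', 'w', encoding='utf-8') as f:
            for i, (name, url) in enumerate(fetch_country_links(), start=1):
                detail = etree.HTML(requests.get(url, timeout=10).content)
                # Same nested-table path as the original script.
                rows = detail.xpath('//div[@id="main_content"]/table/tr/td/table/tr')
                capital = rows[2].xpath('./td/text()') if len(rows) > 2 else []
                f.write('%d %s %s\n' % (i, name, capital[0] if capital else ''))

    if __name__ == '__main__':
        main()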
  • Original post: https://www.cnblogs.com/wozuilang-mdzz/p/9737265.html