• Baidu Baike: person data collection

  The script below crawls a person's Baidu Baike entry and saves the introduction, info-box details, relationships, and biography sections to a JSON file; names whose entries are ambiguous (shared by several people) are logged to a text file instead.

    import json
    import re
    
    import requests
    from urllib.parse import quote
    
    from bs4 import BeautifulSoup
    from pyquery import PyQuery as pq
    
    
    class BaiDuPerson:
        def __init__(self, name):
            self.temp_url = 'https://baike.baidu.com/search/word?word='
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
            }
            self.response = ''
            self.save_path = r'E:\百度json文件'  # output directory for the JSON files (must already exist)
            self.name = name
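            # run the whole scrape-and-save pipeline immediately on construction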
            self.run()
    
        def get_response(self):
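            """Fetch the Baidu Baike entry page and store the decoded HTML."""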
            url = self.temp_url + quote(self.name)
            response = requests.get(url=url, headers=self.headers)
            self.response = response.content.decode('utf8')
    
        def check_ambiguity(self):
            """校验人名是否有歧义--多个人名指代"""
            doc = pq(self.response)
            ul = doc('.polysemantList-wrapper.cmn-clearfix')
            return bool(ul)
    
        def get_introduction(self):
            """
            获得简介
            """
            soup = BeautifulSoup(self.response, "lxml")
            try:
                result = soup.select(".lemma-summary")[0].text
            except IndexError:
                result = ''
            return result
    
        def get_person_lifetime(self):
            """
            获取生平数据
            """
            res = self.response.split('<h2 class="title-text"')
            h2_dict = {}
            if len(res) == 1:
                # no h2 sections on the page: store all paragraph text under one key
                doc = pq(self.response)
                content = doc('.para').text()
                h2_dict['生平'] = content
            else:
                for h2 in res[1:]:
                    tmp2 = {}
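                    # cut trailing page modules (album, references, footer, recommendations, side bar) off this section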
                    if '<div class="album-list">' in h2:
                        h2 = h2.split('<div class="album-list">')[0]
                    if '<dt class="reference-title"' in h2:
                        h2 = h2.split('<dt class="reference-title"')[0]
                    if '<div class="rs - container - foot"' in h2:
                        h2 = h2.split('<div class="rs - container - foot"')[0]
                    if '<div class="tashuo-bottom"' in h2:
                        h2 = h2.split('<div class="tashuo-bottom"')[0]
                    if '<div class="go-auth-box"' in h2:
                        h2 = h2.split('<div class="go-auth-box"')[0]
                    if '<div class="side-content">' in h2:
                        h2 = h2.split('<div class="side-content">')[0]
                    h2 = '<h2 class="title-text"' + h2
                    soup = BeautifulSoup(h2, "lxml")
                    h2_key = soup.find("h2").get_text().replace(self.name, '').strip()
                    h3_dict = {}
                    if "<h3" in h2:
                        for h3 in h2.split("<h3")[1:]:
                            tmp3 = {}
                            h3 = "<h3" + h3
                            soup = BeautifulSoup(h3, "lxml")
                            replace = soup.find("h3").get_text()
                            h3_title = replace.replace(self.name, '').strip()
                            if "<ul" in h3:
                                res = h3.split("<ul")
                                ul_dict = {}
                                for ul in res[1:]:
                                    ul = "<ul" + ul
                                    soup = BeautifulSoup(ul, "lxml")
                                    ul_title = soup.find("ul").get_text().replace(self.name, '').strip()
                                    tmp1 = {}
    
                                    for item in ul.split("</ul>")[1:]:
                                        v_list = []  # collect the multiple values (relationships) under this entry
                                        soup = BeautifulSoup(item, "lxml")
                                        ul_vlist = soup.find_all("div")
                                        for i in ul_vlist:
                                            ul_v = i.get_text().replace("\xa0", '')
                                            # strip citation superscripts such as [1] left by reference links
                                            for shangbiao in re.findall(r"\[\d+\]", ul_v):
                                                ul_v = ul_v.replace(shangbiao, "")
                                            if ul_v == '':
                                                continue
                                            else:
                                                v_list.append(ul_v)
                                        tmp1[ul_title] = v_list
                                    ul_dict.update(tmp1)
                                h3_dict.update(ul_dict)
                            else:
                                h3_v = soup.get_text().replace(replace, "").replace("\xa0", '')
                                for shangbiao in re.findall(r"\[\d+\]", h3_v):
                                    h3_v = h3_v.replace(shangbiao, "")
                                tmp3[h3_title] = [h3_v]
                                h3_dict.update(tmp3)
                            tmp2 = {h2_key: h3_dict}
                        h2_dict.update(tmp2)
                    else:
                        h2_v = soup.get_text().replace(soup.find("h2").get_text(), "").replace("\xa0", '')
                        for shangbiao in re.findall(r"\[\d+\]", h2_v):
                            h2_v = h2_v.replace(shangbiao, "")

                        h2_v = h2_v.split("\n")
                        h2_v_list = []
                        for item in h2_v:
                            if item and item != '编辑':  # drop blanks and the "edit" link text
                                h2_v_list.append(item)
    
                        tmp = {h2_key: h2_v_list}
                        h2_dict.update(tmp)
            return h2_dict
    
        def get_relationship(self):
            """
            获取人物关系
            """
            relationship = []
            soup = BeautifulSoup(self.response, "lxml")
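            # the relationship module lists relation labels (.name) paired with person names (.title)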
            res_ship = soup.select(".info .name")
            res_value = soup.select(".info .title")
            for ship, value in zip(res_ship, res_value):
                relationship.append([self.name, ship.string, value.string])
            return relationship
    
        def get_person_details(self):
            """获取人物标签栏数据"""
            doc = pq(self.response)
            person_detail_key_doc_list = doc('.basic-info.cmn-clearfix dt').items()
            person_detail_key_list = []
            for key_doc in person_detail_key_doc_list:
                person_detail_key = key_doc.text().replace(' ','')
                person_detail_key_list.append(person_detail_key)
            person_detail_value_doc_list = doc('.basic-info.cmn-clearfix dd').items()
            person_detail_value_list = []
            for value_doc in person_detail_value_doc_list:
                person_detail_value = value_doc.text().replace(' ','')
                person_detail_value_list.append(person_detail_value)
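            # pair each <dt> label with its corresponding <dd> value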
            person_detail_dict = dict(zip(person_detail_key_list, person_detail_value_list))
            return person_detail_dict
    
        def get_name(self):
            """抓取的首页的人物名字"""
            soup = BeautifulSoup(self.response, "lxml")
            try:
                name = soup.find("h1").text
            except AttributeError:
                name = ''
            return name
    
        def run(self):
            self.get_response()
            check_ambiguity_result = self.check_ambiguity()
            if check_ambiguity_result:
                # log ambiguous names to a file for manual disambiguation later
                with open('有歧义.txt', 'a', encoding='utf8') as f:
                    f.write(self.name + '\n')
            else:
                introduction = self.get_introduction()
                person_name = self.get_name()
                relationship = self.get_relationship()
                person_lifetime = self.get_person_lifetime()
                person_detail = self.get_person_details()
                person_information = dict()
                person_information['Introduction'] = introduction
                person_information['Rel'] = relationship
                person_information['Details'] = person_detail
                person_information.update(person_lifetime)
                with open(self.save_path + '\\' + person_name + '.json', 'w', encoding='utf8') as f:
                    f.write(json.dumps(person_information, ensure_ascii=False))
    
    
    if __name__ == '__main__':
        name = '裴寂'
        BaiDuPerson(name)
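
A minimal batch driver is sketched below; it assumes the class above is saved as baidu_person.py (a hypothetical module name) and that the save directory already exists. Ambiguous names are appended to 有歧义.txt by run() itself:

    # hypothetical batch driver, not part of the original script
    from baidu_person import BaiDuPerson

    names = ['裴寂', '刘文静']  # sample name list; replace with your own
    for person in names:
        try:
            BaiDuPerson(person)  # the constructor triggers the full scrape via run()
        except Exception as exc:
            print(f'failed on {person}: {exc}')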
  • Source: https://www.cnblogs.com/lqn404/p/13827435.html