Web scraper: name-scoring test, part 2


    1. Fetch the Chinese characters

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    session=requests.session()
    
    #http://xh.5156edu.com/pinyi.html  navigation page listing every pinyin syllable
    #https://www.xingming.com/dafen/   the name-scoring page
    url1="http://xh.5156edu.com/pinyi.html"
    
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    r1=session.get(url1, headers=headers)
    soup = BeautifulSoup(r1.content, 'lxml')
    
    list1 = soup.select("tr > td > a.fontbox")
    
    list2=[]  # [href, pinyin] pairs, one per pinyin link
    for i in list1:
        list2.append([i.get("href"),i.text.strip()])
    
    
    
    def f2(url2):  # return the characters listed on one pinyin page
        #url2 = "http://xh.5156edu.com/html2/p105.html"
        r2=session.get(url2, headers=headers)
        r2.encoding = 'gb18030'  # the site is GB-encoded
        soup = BeautifulSoup(r2.text, 'lxml')
        list3 = soup.select("a.fontbox")
        list4 = []
        for i in list3:
            list4.append(i.text[0])  # keep only the first character of the link text
        return list4
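    
    # A hedged sketch: a retry wrapper around f2 for flaky connections.
    # f2_retry is a helper name introduced here (not on the original page);
    # the download loop below could call it in place of f2.
    def f2_retry(url2, tries=3):
        for _ in range(tries):
            try:
                return f2(url2)
            except requests.RequestException:
                pass  # transient network error: try again
        return []  # give up on this pinyin page after `tries` failures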
    
    
    
    import time
    list5=[]
    for i in list2:
        i2 = "http://xh.5156edu.com/"+i[0]
        print(i2)
        list5.append(f2(i2))
        time.sleep(1)  # be polite: roughly one request per second
    
        
    # Write the characters out, one pipe-separated line per pinyin
    with open("hanzi.txt","w",encoding="utf8") as f:
        for i in list5:
            f.write("|".join(i)+"\n")
        
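    A minimal sanity-check sketch, assuming hanzi.txt was written as above: read the file back and count the distinct characters collected.

    chars = set()
    with open("hanzi.txt", "r", encoding="utf8") as f:
        for line in f:
            chars.update(c for c in line.strip().split("|") if c)
    print(len(chars), "unique characters collected")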

    2. Fetch scores from the name-scoring site

    # -*- coding: utf-8 -*-
    """
    Created on Sun Nov 21 22:31:06 2021
    
    @author: Administrator
    """
    
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    session=requests.session()
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    
    with open("hanzi.txt","r",encoding="utf8") as f:
        list1 = f.readlines()
        f.close()
        
    
    # Flatten every per-pinyin list into one list of characters (21,763 total)
    list2=[]
    for i in list1:
        i2 = i.strip().split("|")
        if len(i2)>0:
            list2.extend(i2)
            
    # Post one candidate given-name character to the scoring site and
    # return the score (surname fixed to 金)
    def ff3(ming):
        #ming="堂"
        url3 = "https://www.xingming.com/dafen/"
        dict0={'xs': '',
        'mz': f'金{ming}',  # fixed surname 金 plus the candidate character
        'action': 'test'}
        r4 = requests.post(url3,data=dict0, headers=headers)
        soup = BeautifulSoup(r4.content, 'lxml')
        try:
            # the score is rendered as red text: <font color="ff0000">
            score = soup.select("font[color='ff0000']")[0].text
        except IndexError:
            # no score element: return the start of the page text instead
            score = soup.text[:15]
        return score
        
    ming=""
    ff3(ming)
    
    ming=""
    url3 = "https://www.xingming.com/dafen/"
    dict0={'xs': '',
    'mz': f'金{ming}',
    'action': 'test'}
    r4 = requests.post(url3,data=dict0, headers=headers)
    soup = BeautifulSoup(r4.content, 'lxml')
    try:
        1/0
        score = soup.select("font[color='ff0000']")[0].text
    except IndexError :
        score = soup.text[:15]
    score
    
    
    # One row per character: column '1' is the character, '2' the score
    df1 = pd.DataFrame([[i,None] for i in list2])
    df1.columns=['1','2']
    df1 = df1.drop_duplicates().reset_index(drop=True).copy()
    
    
    # Keep the rows that have no score yet. Note: "df1['2'] != None"
    # compares elementwise and is always True, so use isna() instead.
    df2 = df1[df1['2'].isna()].copy()
    import datetime,time
    
    for i in range(df2.shape[0]):
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        ming = df2.iloc[i,0]
        if pd.notna(df2.iloc[i,1]):  # already scored: skip
            continue
        try:
            soc = ff3(ming)
            try:
                soc = float(soc)  # numeric score
            except ValueError:
                pass              # keep the raw text (error message)
            df2.iloc[i,1] = soc
        except Exception as e:
            print(now_time,"----err---",str(e),df2.iloc[i,0])
        if i%100 == 0:
            print(now_time,"----------",i)  # progress marker
        time.sleep(0.2)  # throttle requests
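    
    # A hedged sketch: with 20,000+ characters to score, a periodic checkpoint
    # inside the loop above would guard against losing hours of work to one
    # crash. "soc_checkpoint.xlsx" is a hypothetical filename.
    #     if i % 1000 == 0:
    #         df2.to_excel("soc_checkpoint.xlsx")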
    
    
    
    df2.to_excel("soc.xlsx")
    
    
    df3 = df2
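    
    With the scores saved, a short follow-up sketch (assuming the soc.xlsx file and two-column layout written above) loads the results, keeps only rows whose score parsed as a number, and lists the highest-scoring characters:

    df = pd.read_excel("soc.xlsx", index_col=0)
    df.columns = ['char', 'score']  # rename the original '1'/'2' columns
    df['score'] = pd.to_numeric(df['score'], errors='coerce')  # error text -> NaN
    top = df.dropna(subset=['score']).sort_values('score', ascending=False)
    print(top.head(20))  # the 20 best-scoring characters for the surname 金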