• 毛概复习资料spider


    为了复习,把毛概题库趴下来。

    github:https://github.com/xzb123/-spider

    # encoding = 'utf-8'
    import requests,re,os
    from bs4 import BeautifulSoup
    
    #name 表示文件名字
    #url 表示题目路径
    #将这个路径的题目保存为name文件
    def save_html(name,url):
        '''
        #抓取页面
        res = requests.get(url,headers = headers)
        res.encoding = 'utf-8-bom'
        #页面内容
        text = res.content
        #内容写入一个文件
        f = open("temp.html",'wb')
        f.write(text)
        f.close()
        '''
        get_html("temp.html",url)
        #读出内容
        tf = open("temp.html",'r')
        input = tf.read()
        tf.close()
        
        #正则表达式替换掉部分内容
        output1 = re.sub('<iframe (.*)',' ',input)
        output2 = re.sub("<input type='hidden' (.)* value",' ',output1)
        #将最终内容写入name文件
        f = open(name,'w')
        f.write(output2)
        f.close()
        print(name," has saved ok!")
    
    #根据mom.html文件内容得到需要进一步爬取的sonurl
    def get_sonurl(url_list):
        f = open('mom.html','r')
        text = f.read()
        soup = BeautifulSoup(text,'html.parser')
        f.close()
        a_list = soup.find_all('a')
        for i in a_list:
    #        print(i['href'])
            #find_all得到的list中每个是slice对象,可以直接得到属性,很舒服呀~
            url_list.append(i['href'])
    #    print(url_list)
        print("find sonurl ok!")
    
    #根据url获取mom.html
    def get_html(name,url):
        f = open(name,'wb')
        res = requests.get(url,headers = headers)
        res.encoding = 'utf-8-bom'
        text = res.content
        f.write(text)
        f.close()
        print(name + " " + url," has get ok!")
        
    if __name__ == "__main__":
        print("welcome!")
        print("input your cookie:
    like this 
    JSESSIONID=7CBC32078B041854FE8EFAEE1A961E98; DWRSESSIONID=Eghu$3ODyHz35BCqw4xEZV5ugJm")
        cookie = 'SESSIONID=7CBC32078B041854FE8EFAEE1A961E98; DWRSESSIONID=Eghu$3ODyHz35BCqw4xEZV5ugJm'
        cookie = input()
        print("input path:
    like this 
     xzb/")
        path = input()
        if os.path.exists(path):
            pass
        else:
            os.mkdir(path)
        #header信息,主要是cookie
        headers = {}
        headers['Cookie'] = cookie
        
        momurl = 'http://222.195.158.247/meol/common/question/test/student/stu_qtest_more_result.jsp?testId=22400165'
        faurl = 'http://222.195.158.247/meol/common/question/test/student/'
        #得到mom.html
        get_html("mom.html",momurl)
        url_list = []
        #解析mom.html
        get_sonurl(url_list)
    #    print(url_list)
        cnt = 1
        for sonurl in url_list:
            url = faurl + sonurl
            name = path + str(cnt) + '.html'
            print(name)
            cnt = cnt+1
            save_html(name,url)
        os.remove("temp.html")
        os.remove("mom.html")
        
        print("finish!")
  • 相关阅读:
    前端资源网址
    IDEA激活工具
    新建jsp项目
    jsp笔记
    iOS的SVN
    iOS学习网站
    测试接口工具
    MVP模式
    关于RxJava防抖操作(转)
    注释模板
  • 原文地址:https://www.cnblogs.com/gudygudy/p/11020303.html
Copyright © 2020-2023  润新知