• 2017-11-11 Sa Oct Spider


    2017-11-11 Sa Oct Spider

    4:33 PM

    Again.

    First, test urllib:

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open ``filename`` in the default web browser.

        Uses the ``webbrowser`` module in-process rather than formatting the
        path into a ``python -m webbrowser "..."`` shell command.  A path that
        contains quotes or shell metacharacters therefore cannot break (or
        inject into) the command, and no ``python`` executable has to be
        reachable through ``PATH``.

        Args:
            filename: Path or URL handed to ``webbrowser.open``.
        """
        # Function-scope import keeps this definition self-contained.
        import webbrowser

        webbrowser.open(filename)
    
    # Placeholder values sent below as the 'name' and 'xuehao' form fields.
    name = 'xxx'
    no = 'xxx'
    
    # The same login URL is first fetched (hosturl) and then posted to (posturl).
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Install a cookie-aware opener globally so that every later urllib2.urlopen
    # call shares the same cookie jar.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Initial fetch of the login page; the returned object is not used again in
    # this script (presumably the request is only made to collect cookies).
    h = urllib2.urlopen(hosturl)
    
    # Extra request headers sent with the login POST.
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    # NOTE(review): idx is not referenced anywhere else in this script.
    idx = name + ' ' + no
    
    # Login form fields.  __VIEWSTATE and __EVENTVALIDATION are hard-coded;
    # presumably they were copied from the hidden fields of the login page --
    # confirm the server still accepts these values.
    postData = {
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    }
    
    # URL-encode the form and submit it with the headers above; the request is
    # abandoned if the server does not answer within 5 seconds.
    postData = urllib.urlencode(postData)
    request = urllib2.Request(posturl, postData, headers)
    response = urllib2.urlopen(request, timeout=5)
    
    # Save the response body to a temporary .html file that outlives the script
    # (delete=False).  The browser is launched only after the ``with`` block has
    # closed the file, so the whole page is flushed to disk before anything
    # tries to read it.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write(response.read())
    openWithBrowser(f.name)
    

    Good. Nothing changed. Then apply it to the whole table.

    5:09 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open ``filename`` in the default web browser.

        Uses the ``webbrowser`` module in-process rather than formatting the
        path into a ``python -m webbrowser "..."`` shell command.  A path that
        contains quotes or shell metacharacters therefore cannot break (or
        inject into) the command, and no ``python`` executable has to be
        reachable through ``PATH``.

        Args:
            filename: Path or URL handed to ``webbrowser.open``.
        """
        # Function-scope import keeps this definition self-contained.
        import webbrowser

        webbrowser.open(filename)
    
    # Name of this run's output directory, derived from the current local time.
    # The directory is created right away; get() writes its pages into it.
    version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
    os.mkdir(version)
    
    # The same login URL is first fetched (hosturl) and then posted to (posturl).
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Install a cookie-aware opener globally so that every later urllib2.urlopen
    # call shares the same cookie jar.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Initial fetch of the login page; the returned object is not used again
    # (presumably the request is only made to collect cookies).
    h = urllib2.urlopen(hosturl)
    
    # Extra request headers sent with every login POST issued by get().
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    def get(name, no):
        """Log in as one student and save the returned page to disk.

        Posts the login form to the module-level ``posturl`` using the
        module-level ``headers`` and writes the response body to
        ``<version>/<no>.html``, with a UTF-8 ``<meta charset>`` tag inserted
        right after every ``<head>``.

        Args:
            name: Value sent as the ``name`` form field.
            no: Value sent as the ``xuehao`` form field; also used as the
                base name of the output file.

        Raises:
            Whatever ``urllib2.urlopen`` raises when the request fails or does
            not complete within 5 seconds, and ``IOError`` if the output file
            cannot be written.
        """
        # No ``global`` statement is needed: module-level names are only read.
        # __VIEWSTATE / __EVENTVALIDATION are hard-coded; presumably they were
        # copied from the login page's hidden fields -- confirm they stay valid.
        postData = urllib.urlencode({
            '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
            '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
            'name' : name,
            'pwd' : '12345',
            'btnchange' : '登录',
            'xuehao' : no
        })
        request = urllib2.Request(posturl, postData, headers)
        response = urllib2.urlopen(request, timeout=5)
        try:
            page = response.read()
        finally:
            # This function runs once per student; close each response so the
            # sockets do not pile up, even when reading the body fails.
            response.close()

        page = page.replace('<head>', '<head><meta charset="utf-8">')
        with open('{}/{}.html'.format(version, no), 'w') as f:
            f.write(page)
    
    # Fetch one page per CSV row.  Each row is expected to have exactly four
    # comma-separated fields; the first two are passed to get() as name and
    # number, the remaining two are ignored.
    with open('result_utf8.csv', "rb") as f:
        print(version)
        for line in f:
            fields = line.split(',')
            if len(fields) != 4:
                # Blank or malformed row: skip it instead of aborting the run.
                continue
            name, no = fields[0], fields[1]
            try:
                get(name, no)
            except Exception as e:
                # Best effort: keep going with the next student, but say which
                # one failed.  Only the number is printed, not the name.
                # Catching Exception (not a bare except) lets Ctrl-C stop the run.
                sys.stderr.write('failed to fetch {}: {!r}\n'.format(no, e))
    

    I spent some time trying to write the output to files with Chinese filenames, but gave up eventually. It even raised an exception when I printed the (Chinese) name to the console (a decoding issue).

    Then I'd write a reporter.

    6:41 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    import Tkinter
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Load the roster once at start-up: one list of comma-separated fields per
    # line of the file (the last field of each row keeps its line ending).
    # The ``with`` block closes the file instead of leaking the handle.
    with open('result_utf8.csv') as roster:
        csv = [line.split(',') for line in roster]
    
    def getname(no):
        """Look up the first field of the roster row whose second field is ``no``.

        Rows of the module-level ``csv`` list are scanned in order and the
        first match wins.  Returns an empty string when no row matches.
        """
        matching_names = (row[0] for row in csv if row[1] == no)
        return next(matching_names, '')
    
    def getcourse(filename):
        """Extract the course row from a saved result page.

        The page is searched for the first occurrence of the marker text
        '退选'.  The first ``<tr>...</tr>`` that starts after the marker is then
        reduced to plain text: tags are dropped, every text fragment is
        followed by one space, and extraction stops at the first ``&`` found
        in the text (in the sample pages that is the ``&nbsp;`` cell following
        the teacher column).  Everything before the first ASCII digit is
        discarded, each run of consecutive spaces is collapsed to one space
        and newlines are removed.

        For a row whose cells are ``348``, ``12``, a course name and a teacher
        name, the result looks like ``'348  12 <course> <teacher> '``.

        Args:
            filename: Path of the saved HTML page.

        Returns:
            The normalised row text, or ``''`` when the marker, the row, or a
            digit inside the row cannot be found.
        """
        with open(filename) as page:
            s = page.read()

        marker = s.find('退选')
        if marker == -1:
            return ''

        # Explicit guards: without a complete row after the marker there is
        # nothing to extract.
        trbegin = s.find('<tr>', marker)
        if trbegin == -1:
            return ''
        trend = s.find('</tr>', trbegin)
        if trend == -1:
            return ''

        pieces = []
        pos = trbegin
        while pos < trend:
            if s[pos] == '<':
                # Skip the whole tag.  A '>' always exists at or after pos
                # because the closing '</tr>' lies ahead.
                pos = s.find('>', pos) + 1
                continue

            # Text fragment: runs up to the next tag (at the latest '</tr>').
            stop = s.find('<', pos)
            text = s[pos:stop]
            amp = text.find('&')
            if amp != -1:
                # First entity in the text ends the interesting part of the row.
                pieces.append(text[:amp])
                break
            pieces.append(text + ' ')
            pos = stop
        res = ''.join(pieces)

        # Drop the leading whitespace-only fragments: start at the first digit.
        first_digit = re.search('[0-9]', res)
        if first_digit is None:
            return ''
        tail = res[first_digit.start():]

        # Collapse space runs first, then delete newlines (this order matters:
        # spaces separated only by newlines are not merged).
        return re.sub(' +', ' ', tail).replace('\n', '')
    
    def report():
        """Build an HTML summary of the saved pages and open it in a browser.

        The directory name is read from the ``workdir`` entry.  For every
        regular file in that directory one table row is written: the file
        name without its extension (used as the student number), the name
        ``getname`` returns for that number, and the text ``getcourse``
        extracts from the file.  The report goes to a temporary ``.html``
        file that is kept on disk (``delete=False``).

        Files are addressed by joined paths, so the process working directory
        is never changed.  Entries are processed in sorted order to make the
        report reproducible.

        Raises:
            OSError: If the directory named in ``workdir`` cannot be listed.
        """
        # Function-scope import keeps this definition self-contained.
        import webbrowser

        wd = workdir.get()
        # List the directory before creating the temp file, so a bad directory
        # name does not leave a stray half-written report behind.
        entries = sorted(os.listdir(wd))

        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
            f.write('<head><meta charset="utf-8"></head>')
            f.write('<h1>Spider report</h1>')
            f.write('<p><b>Version {}</b></p>'.format(wd))
            f.write('<table>')

            for entry in entries:
                path = os.path.join(wd, entry)
                if not os.path.isfile(path):
                    continue
                # splitext copes with names that have no dot or several dots.
                no = os.path.splitext(entry)[0]
                name = getname(no)
                s = getcourse(path)
                f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))

            f.write('</table>')

        # Open the report only after the file has been closed, i.e. fully
        # flushed to disk.
        webbrowser.open(f.name)
    
    # Minimal Tk front end: a "Report" button followed by a text entry bound to
    # ``workdir``; pressing the button calls report().
    gui = Tkinter.Tk()
    workdir = Tkinter.StringVar()

    reportButton = Tkinter.Button(gui, command=report, text="Report")
    reportButton.pack(side=Tkinter.LEFT)

    workdirEntry = Tkinter.Entry(gui, width=40, textvariable=workdir)
    workdirEntry.pack(side=Tkinter.LEFT)

    # Blocks until the window is closed.
    gui.mainloop()
    
  • 相关阅读:
    Java中常见时间类的使用
    springboot2.0介绍1
    Element-ui-Basic
    Java开发中的23中设计模式详解(一)工厂方法模式和抽象工厂模式
    CSS3 变形、过渡、动画、关联属性浅析
    Webpack 入门教程
    ES6对象简洁语法
    如何下载ts文件
    PPT转PDF
    python实践
  • 原文地址:https://www.cnblogs.com/yanhuihang/p/7819698.html
Copyright © 2020-2023  润新知