2017-11-11 Sa Oct Spider
4:33 PM
Again.
Firstly test liburl:
# -*- coding: utf-8 -*-
import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
reload(sys)
sys.setdefaultencoding("utf-8")
def openWithBrowser(filename):
os.system('python -m webbrowser "{}"'.format(filename))
name = 'xxx'
no = 'xxx'
hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
h = urllib2.urlopen(hosturl)
headers = {
'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
}
idx = name + ' ' + no
postData = {
'__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
'__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
'name' : name,
'pwd' : '12345',
'btnchange' : '登录',
'xuehao' : no
}
postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request, timeout=5)
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
f.write(response.read())
openWithBrowser(f.name)
Good. Nothing changed. Them apply the table.
5:09 PM
# -*- coding: utf-8 -*-
import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
reload(sys)
sys.setdefaultencoding("utf-8")
def openWithBrowser(filename):
os.system('python -m webbrowser "{}"'.format(filename))
version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
os.mkdir(version)
hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
h = urllib2.urlopen(hosturl)
headers = {
'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
}
def get(name, no):
global hosturl, posturl, cj, cookie_support, opener, h, headers
postData = {
'__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
'__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
'name' : name,
'pwd' : '12345',
'btnchange' : '登录',
'xuehao' : no
}
postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request, timeout=5)
with open('{}/{}.html'.format(version, no), 'w') as f:
f.write(response.read().replace('<head>', '<head><meta charset="utf-8">'))
with open('result_utf8.csv', "rb") as f:
print version
for line in f:
(name, no, x1, x2) = line.split(',')
try:
get(name, no)
except:
pass
It took some time to output to Chinese filename. Gave up eventually. It even raised exception when I printed name
(Chinese) to the console (decode stuff).
Then I'd write a reporter.
6:41 PM
# -*- coding: utf-8 -*-
import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import Tkinter
reload(sys)
sys.setdefaultencoding("utf-8")
csv = [line.split(',') for line in open('result_utf8.csv')]
def getname(no):
for i in csv:
if i[1] == no:
return i[0]
return ''
def getcourse(filename):
s = open(filename).read()
i = s.find('退选')
if i != -1:
trbegin = s.find('<tr>', i)
# s[trbegin...] e.g.
# <tr>
# <td width="10%">
# <a id="GridView1_ctl02_LinkButton1" href="ja
# vascript:__doPostBack('GridView1$ctl02$LinkButton1','')">348</a>
#
# </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&n
# bsp;</td><td width="10%">
# <a id="GridView1_ctl02_LinkButton2" href="ja
# vascript:__doPostBack('GridView1$ctl02$LinkButton2','')">退选</a>
# </td>
trend = s.find('</tr>', trbegin)
read = 0
res = ''
i = trbegin
while i < trend:
if s[i] == '<':
while s[i] != '>':
i += 1
i += 1
continue
end = False
while s[i] != '<':
if s[i] == '&':
end = True
break
res += s[i]
i += 1
if end:
break
res += ' '
res2 = ''
i = 0
while i < len(res) and not (res[i] in "0123456789"):
i += 1
while i < len(res):
if res[i] == '
':
i += 1
else:
res2 += res[i]
if res[i] == ' ':
while i < len(res) and res[i] == ' ':
i += 1
else:
i += 1
return res2
return ''
def report():
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
wd = workdir.get()
os.chdir(wd)
f.write('<head><meta charset="utf-8"></head>')
f.write('<h1>Spider report</h1>')
f.write('<p><b>Version {}</b></p>'.format(wd))
f.write('<table>')
for i in os.listdir('.'):
(no, x1) = i.split('.')
name = getname(no)
s = getcourse(i)
f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))
os.system('python -m webbrowser {}'.format(f.name))
os.chdir('..')
gui = Tkinter.Tk()
workdir = Tkinter.StringVar()
Tkinter.Button(gui, text="Report", command=report).pack(side=Tkinter.LEFT)
Tkinter.Entry(gui, textvariable=workdir, width=40).pack(side=Tkinter.LEFT)
gui.mainloop()