1 #-*- coding=utf-8 -*-
2 import requests
3 import re
4 import json
5 import time
6 from PIL import Image
7 import cStringIO
8 import cookielib
9 import urllib
10 import os
11 import xlrd
12
13 from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning
14 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
15 requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
16
# Workbook holding the student records.  It is opened as soon as the module
# is imported; ``table`` is the sheet whose rows the main loop walks through.
data = xlrd.open_workbook('1.xlsx')
table = data.sheet_by_name(u'Sheet1')
19
# Endpoints on matrix.dean.swust.edu.cn.
login_url = 'https://matrix.dean.swust.edu.cn/cas/login'
message_url = (
    'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm'
    '?event=studentProfile:DEFAULT_EVENT'
)

# Mutable module-level state shared by the functions in this file.
topic_url = ''  # link taken from the login response; assigned by login()
flag = 0        # set to 1 by login() once that link has been found
temp = ''       # name of the most recently downloaded student; set by download()
pic_count = 1   # not read by any code visible in this file
26
# Record for the student currently being processed.  get_message() fills the
# fields in and download() reads them; every field starts as an empty string.
# (The former ``student = {}`` line was a dead store that was overwritten
# immediately, so it has been dropped.)
student = {
    '学号': '',    # student number
    '姓名': '',    # name
    '性别': '',    # gender
    '生日': '',    # birthday (never filled in by the code visible here)
    'pic': '',     # URL of the profile photo
    '民族': '',    # ethnicity (never filled in by the code visible here)
    '行政班': '',  # administrative class
    '专业': '',    # major
}
38
# Browser-like User-Agent sent with every request made through ``session``.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    ),
}

# One shared HTTP session; its cookies live in an LWP cookie jar bound to the
# file "cookies" in the working directory.
session = requests.Session()
session.headers = headers
session.cookies = cookielib.LWPCookieJar(filename='cookies')
# Cookies saved by an earlier run are not loaded.  The disabled code that used
# to sit here called session.cookies.load(ignore_discard=True) and, when that
# failed, printed a notice that a fresh login is required.
50
51
def get_lt(url="https://matrix.dean.swust.edu.cn/cas/login"):
    """Fetch the CAS login page and return its hidden ``lt`` form token.

    The token is a dynamically changing parameter, so it is read from a
    freshly loaded login page for every login attempt.

    :param url: address of the login page to fetch.
    :returns: the ``value`` of the first hidden ``lt`` input on the page.
    :raises IndexError: if the page contains no ``lt`` input.
    """
    # NOTE(review): certificate verification is switched off for this request
    # only; the other requests in this file use the session default.
    index_page = session.get(url, verify=False)
    tokens = re.findall(r'name="lt" type="hidden" value="(.*?)"',
                        index_page.content)
    if not tokens:
        # Same exception type the bare ``lt[0]`` lookup used to raise, but
        # with a message that says what actually went wrong.
        raise IndexError('no "lt" token found on login page %s' % url)
    return tokens[0]
61
def login(username, password):
    """Submit the CAS login form and remember the link found in the response.

    Posts the credentials together with a fresh ``lt`` token.  When the
    response contains a "btn btn-primary" link, its ``href`` is stored in the
    global ``topic_url`` and the global ``flag`` is set to 1.  Otherwise
    ``flag`` is set to 0 so that an earlier success cannot be mistaken for the
    result of this attempt.  The session cookies are written to the cookie-jar
    file in either case.

    :param username: value for the form's ``username`` field.
    :param password: value for the form's ``password`` field.
    :raises Exception: anything raised by ``get_lt()`` propagates unchanged.
    """
    global topic_url
    global flag

    form = {
        'lt': get_lt(),
        'username': username,
        'password': password,
        'service': 'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT',
    }
    try:
        response = session.post(login_url, data=form)
        links = re.findall(r'<a class="btn btn-primary" href="(.*?)"',
                           response.content)
        topic_url = links[0]
        flag = 1
    except (requests.RequestException, IndexError):
        # Network failure, or no link in the response (IndexError on the empty
        # match list): treat the attempt as failed.  The former bare
        # ``except: pass`` also swallowed KeyboardInterrupt and hid real bugs.
        flag = 0
    session.cookies.save()
83
def error_clean(error_temp):
    """Reset the per-student state when ``error_temp`` equals the global ``temp``.

    ``temp`` holds the name recorded by the most recent successful download.
    When ``error_temp`` equals it, the session cookies are cleared, ``student``
    is replaced by a record of empty fields, and ``flag`` / ``topic_url`` are
    reset.  Any other value leaves all state untouched.

    :param error_temp: value compared against the global ``temp``.
    """
    # ``flag`` and ``topic_url`` have to be declared global: without the
    # declaration the assignments below only created unused locals and the
    # module-level values were never reset.
    global student
    global flag
    global topic_url

    if error_temp != temp:
        return

    session.cookies.clear()
    student = {
        '学号': '',
        '姓名': '',
        '性别': '',
        '生日': '',
        'pic': '',
        '民族': '',
        '行政班': '',
        '专业': '',
    }
    flag = 0
    topic_url = ''
103
104
105
106
def isLogin():
    """Return True when the profile page answers with HTTP status 200.

    The page is requested without following redirects, so a redirect response
    (or any other status than 200) is reported as False.

    :returns: ``True`` if the status code is 200, ``False`` otherwise.
    """
    response = session.get(message_url, allow_redirects=False)
    # status_code is already an integer; the former ``int(x=...)`` wrapper was
    # redundant, and its keyword form is rejected by Python 3.7 and later.
    return response.status_code == 200
115
def get_message():
    """Scrape the profile page of the logged-in account into ``student``.

    Requests ``topic_url`` (the link captured by ``login()``) and then loads
    ``message_url``.  Fields are picked out of the page's ``<td>`` cells by
    position.  On success the student number, name, gender, major,
    administrative class and photo URL are stored in the global ``student``.
    If the page does not contain the expected cells, those six fields are
    blanked instead, so values left over from a previously processed student
    are not reused.
    """
    # The body of this first response is not used; the request is simply made
    # before the profile page is loaded.
    # NOTE(review): presumably this completes the sign-on for the session --
    # confirm against the site's login flow.
    session.get(topic_url)
    page = session.get(message_url).text

    cells = re.findall(r'<td>(.*?)</td>', page)
    try:
        # Cell positions are tied to the layout of the profile page.
        # The pattern needs ``\d``: the former ``(d*?)`` matched only literal
        # letter "d" characters, so the lookup always failed and the bare
        # ``except`` silently left every field unset.
        number = re.findall(r'<span class="number">(\d+)</span>', cells[2])[0]
        record = {
            '学号': number,
            '姓名': cells[4],
            '性别': cells[6],
            '专业': cells[37],
            '行政班': cells[27],
            'pic': 'https://matrix.dean.swust.edu.cn/acadmicManager/student/profile/' + number + '.jpg',
        }
    except IndexError:
        # Missing cell or no student number in cell 2.
        record = dict.fromkeys(
            ('学号', '姓名', '性别', '专业', '行政班', 'pic'), '')
    student.update(record)

    # Birthday and ethnicity are left unset; the lines that read them from
    # cells 8 and 10 were commented out in the original code.
143
144
def download():
    """Download the current student's photo into a folder named after the major.

    Reads the record in the global ``student``.  The image is written to
    ``<working dir>/<major>/<name><student number>.jpg``.  On success a
    confirmation line is printed, the name is remembered in the global
    ``temp`` and the session cookies are cleared.  Failures are reported and
    swallowed so the caller can move on to the next student.
    """
    global temp

    savepath = os.path.join(os.path.abspath('.'), student['专业'])
    try:
        if not os.path.exists(savepath):
            os.mkdir(savepath)
        picpath = os.path.join(
            savepath, student['姓名'] + student['学号'] + '.jpg')
        r = session.get(student['pic'])
        # Do not save an HTTP error page as if it were the photo.
        r.raise_for_status()
        with open(picpath, "wb") as pic:
            pic.write(r.content)
        print(u'>>>>>>>>>成功抓取>>>>>>>>>>>>>>>>>>>>' + student['姓名'])
        temp = student['姓名']
        session.cookies.clear()
    except (requests.RequestException, EnvironmentError, UnicodeError) as error:
        # Request failures (including an empty or invalid photo URL), file
        # system errors, and console/encoding errors are expected here; report
        # them instead of hiding every exception behind a bare ``pass``.
        print('download failed for student %s: %r' % (student['学号'], error))
164
165
if __name__ == '__main__':
    # NOTE(review): this loop signs in as each listed student with credentials
    # taken from the spreadsheet and stores their photo.  Confirm that the
    # account holders and the site operator have authorised this, and that
    # keeping the photos complies with the applicable privacy rules, before
    # running it.
    FIRST_ROW = 5000  # index of the first worksheet row to process

    # Read each column once instead of re-reading it for every row.
    numbers = table.col_values(1)
    names = table.col_values(2)
    genders = table.col_values(3)
    password_sources = table.col_values(13)

    # Stop at the last row: the former counter-based loop ran ``nrows`` times
    # starting from row 5000, so it indexed past the end of the sheet and
    # ended with an IndexError.
    for row in range(FIRST_ROW, table.nrows):
        if genders[row] == u'女' and names[row] != u'王珀会':
            try:
                # Column 1 is converted via int() to drop the fractional part
                # of the numeric cell; the password is characters 11-16 of the
                # text in column 13.
                login(str(int(numbers[row])),
                      str(password_sources[row])[11:17])
            except Exception as error:
                # Best effort: report the row and carry on with the next one.
                print('login failed for row %d: %r' % (row, error))
            if flag == 1:
                flag = 0
                get_message()
                download()
        session.cookies.clear()
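# Summary:
#   Handling Excel files in Python:
#     http://www.cnblogs.com/lhj588/archive/2012/01/06/2314181.html
#   Clearing the cookies of a requests session:
#     http://stackoverflow.com/questions/23816139/clear-cookies-from-requests-python
# Notes:
#   1.xlsx is the Excel workbook that supplies the student records.
#   The trade-offs between the different exception handlers need to be
#   planned in advance.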