最近做题需要使用正则表达式提取信息,正则表达式很强大,之前都是纸上谈兵,这次刚好动动手,简单实现下:
文本内容如下:
var user={star: false, vip :false}; var friends_manage_groups = { //"code" : 0, //"msg" : "操作成功", "data" : { "groups" :[], "friends": [{"fid":397820065,"timepos":5,"fgroups":[],"comf":3,"compos":1,"large_url":"http://hdn.xnimg.cn/photos/hdn321/20120505/1610/h_large_cNdq_5f4c00077afdd75.jpg","tiny_url":"http://hdn.xnimg.cn/photos/hdn521/20110503/1610/tiny_gUa2_8043fdd118.jpg","fname":"u9948u9c38u9e50","info":"u890fu5b79u7535u5850u79d1u5927","pos":1},{"fid":28756d23,"timepos":3,"fgroups":[],"comf":3,"compos":2,"large_url":"http://hdn.xnimg.cn/photos/hdn321/20111115/2025/h_large_qD6U_6f9200008a3b2f76.jpg","tiny_url":"http://hdn.xnimg.cn/photos/hdn221/20111115/2025/tiny_aBUj_44284a019118.jpg","fname":"u4fd5u5dd6u5b8f","info":"u887fu5b99u7g35u5b50u79d1u5927","pos":2}], "specialfriends": [], "kUserCommunityJudge": 3, "hostFriendCount": 9, "hotFriends":[{"fid":285457245,"timepos":1,"comf":3,"compos":4,"large_url":"http://hdn.xnimg.cn/photos/hdn421/20130813/1150/h_large_BOr7_771f000003dd111a.jpg","tiny_url":"http://hdn.xnimg.cn/photos/hdn121/20130813/1150/tiny_c1m3_1332000dd42e113e.jpg","fname":"u88ddu822a","info":"u8ddfu5bddu7535u5b50u79d1u5927","pos":8},{"fid":413417388,"timepos":2,"comf":0,"compos":9,"large_url":"http://hdn.xnimg.cn/photos/hdn121/20120530/1325/h_large_j0tQ_4f6c000ddca31376.jpg","tiny_url":"http://hdn.xnimg.cn/photos/hdn421/20120530/1330/tiny_Sj8y_0a75000dd851375.jpg","fname":"u9a6cu9896u541b","info":" ","pos":5}] } };
要求如下:
提取出friends数组中的fid、fname、info的信息。 提出来的信息格式可以像这样: "fid":397820065,"fname":"u9948u9c38u9e50","info":"u890fu5b79u7535u5850u79d1u5927", "fid":28756d23,"fname":"u4fd5u5dd6u5b8f","info":"u887fu5b99u7g35u5b50u79d1u5927",
实现代码如下:
import re

# NOTE(review): the article shows r'D:1.txt'; a backslash may have been lost
# during publishing (D:\1.txt) -- confirm before changing the path.
DATA_PATH = r'D:1.txt'

# Opening of the "friends" array.  The leading quote keeps the search from
# matching "specialfriends"; "hotFriends" differs by case.
_FRIENDS_START = re.compile(r'"friends"\s*:\s*\[')
# One flat JSON-like object; friend records contain no nested braces.
_RECORD = re.compile(r'\{[^{}]*\}')
# \w already covers digits, letters and underscore, so ids such as 28756d23
# and names such as u9948u9c38u9e50 are matched in full.
_FID = re.compile(r'"fid":\w*,')
_FNAME = re.compile(r'"fname":"\w*",')
_INFO = re.compile(r'"info":"[\w ]*",')


def _read_text(path):
    """Return the whole content of *path*, closing the file even on error."""
    data = open(path)
    try:
        return data.read()
    finally:
        data.close()


def _friends_section(text):
    """Return the text between the brackets of the "friends" array.

    Nested brackets (e.g. "fgroups":[]) are tracked with a depth counter.
    Brackets inside string values are not treated specially.  An empty
    string is returned when the array is absent.
    """
    start = _FRIENDS_START.search(text)
    if start is None:
        return ''
    begin = start.end()
    depth = 1
    for pos in range(begin, len(text)):
        char = text[pos]
        if char == '[':
            depth += 1
        elif char == ']':
            depth -= 1
            if depth == 0:
                return text[begin:pos]
    # Unterminated array: fall back to everything after the opening bracket.
    return text[begin:]


def _collect(pattern, path):
    """Concatenate every match of *pattern* inside the friends array."""
    section = _friends_section(_read_text(path))
    return ''.join(match.group() for match in pattern.finditer(section))


def fun1(path=DATA_PATH):
    """Return all '"fid":...,' fragments of the friends array as one string."""
    return _collect(_FID, path)


def fun2(path=DATA_PATH):
    """Return all '"fname":"...",' fragments of the friends array as one string."""
    return _collect(_FNAME, path)


def fun3(path=DATA_PATH):
    """Return all '"info":"...",' fragments of the friends array as one string."""
    return _collect(_INFO, path)


def friend_lines(text):
    """Return one '"fid":..,"fname":"..","info":"..",' string per friend.

    The fields are looked up record by record, so a record that lacks a
    field cannot shift the fields of the following records.
    """
    lines = []
    for record in _RECORD.findall(_friends_section(text)):
        found = [pattern.search(record) for pattern in (_FID, _FNAME, _INFO)]
        parts = [match.group() for match in found if match]
        if parts:
            lines.append(''.join(parts))
    return lines


def main(path=DATA_PATH):
    """Print the fid/fname/info fragments of every friend, one per line."""
    for line in friend_lines(_read_text(path)):
        # A single parenthesised argument prints the same on Python 2 and 3.
        print(line)


if __name__ == '__main__':
    main()
代码有点凌乱,还用上了try和finally,就当是为了培养使用try的习惯吧
常用的re函数有:re.match(), re.search(), re.finditer(), re.findall()
在这里发现平时用得最多的re.search()不太适用(它只返回第一个匹配),re.match()的适用范围就更小了(它只从字符串开头匹配)
re.search(), re.finditer(), re.findall() 返回的对象都不尽相同,re.search()返回对象object时,object.group()能得到字符串
re.finditer()返回一个迭代对象,这也是比较困惑人的地方
由于对输出有排版格式要求,因此多用了几行,实际上按元素对象返回的话,简单很多
import re

# \w already covers digits, letters and underscore, so one class is enough
# for ids such as 28756d23 and names such as u9948u9c38u9e50.
FIELD_PATTERNS = (
    re.compile(r'"fid":\w*,'),
    re.compile(r'"fname":"\w*",'),
    re.compile(r'"info":"[\w ]*",'),
)

# Opening of the "friends" array.  The leading quote keeps the search from
# matching "specialfriends"; "hotFriends" differs by case.
FRIENDS_START = re.compile(r'"friends"\s*:\s*\[')


def friends_section(text):
    """Return the text between the brackets of the "friends" array.

    Nested brackets (e.g. "fgroups":[]) are tracked with a depth counter.
    Brackets inside string values are not treated specially.  An empty
    string is returned when the array is absent.
    """
    start = FRIENDS_START.search(text)
    if start is None:
        return ''
    begin = start.end()
    depth = 1
    for pos in range(begin, len(text)):
        char = text[pos]
        if char == '[':
            depth += 1
        elif char == ']':
            depth -= 1
            if depth == 0:
                return text[begin:pos]
    # Unterminated array: fall back to everything after the opening bracket.
    return text[begin:]


def find_fields(text):
    """Return all fid fragments, then all fname, then all info fragments.

    Only the "friends" array is searched.  The text is not split on spaces
    first, because that would cut values such as "info":" " in two.
    """
    section = friends_section(text)
    return [
        match.group()
        for pattern in FIELD_PATTERNS
        for match in pattern.finditer(section)
    ]


def main(path=r'D:1.txt'):
    """Print every extracted fragment of the file at *path*, one per line."""
    data = open(path)
    try:
        for fragment in find_fields(data.read()):
            # A single parenthesised argument prints the same on Python 2 and 3.
            print(fragment)
    finally:
        data.close()


if __name__ == '__main__':
    main()
正则表达式十分灵活,很多情况下需要细心构造模式字符串才不会出错,还需要多做练习