import re

# Lines that open and close one subscriber record in the input file.
_SUB_BEGIN = "<SUBBEGIN"
_SUB_END = "<SUBEND"

# "OPTGPRS=<index>-<second field>-..." ; group(1) captures the second field.
# The trailing (?:-|$) keeps "33" from also matching "330" and "0" from "05".
_OPTGPRS_FIELD = re.compile(r"OPTGPRS=\d+-(\d+)(?:-|$)")
_IMSI_FIELD = re.compile(r"IMSI=\d+")
_MSISDN_FIELD = re.compile(r"MSISDN=\d+")


def _iter_subscriber_records(lines):
    """Yield the list of ``KEY=VALUE`` fields of each complete record."""
    fields = None  # None while we are outside a <SUBBEGIN ... <SUBEND pair
    for raw_line in lines:
        line = raw_line.strip()  # drops indentation, trailing blanks and "\n"
        if line == _SUB_BEGIN:
            fields = []
        elif line == _SUB_END:
            if fields is not None:
                yield fields
            fields = None
        elif fields is not None:
            # A line normally holds one "KEY=VALUE;" field, but tolerate several.
            for part in line.split(";"):
                part = part.strip()
                if part:
                    fields.append(part)


def _first_full_match(pattern, fields):
    """Return the first field that matches *pattern* entirely, else None."""
    for field in fields:
        if pattern.fullmatch(field):
            return field
    return None


def process(input_data):
    """
    Extract IMSI and MSISDN of every subscriber that has both an OPTGPRS
    entry whose second field is 0 and one whose second field is 33.

    :param input_data: path of the subscriber information file. Sample record:
        <SUBBEGIN
            IMSI=1243560615528273;
            MSISDN=986768559232;
            VLRLIST=10;
            OPTGPRS=3-33-504-241-33-NONE-0-NONE-00000000-65535-0-0-PS_APN-NONE-65535-1;
            OPTGPRS=1-0-504-241-33-NONE-0-NONE-00000000-65535-0-0-PS_APN-NONE-65535-1;
            CHARGE_GLOBAL=3;
        <SUBEND
    :return: list of strings, one per matching subscriber, formatted like
        'IMSI=1243560615528273;MSISDN=986768559232'. Records that are not
        closed by <SUBEND, or that lack an IMSI or MSISDN field, are skipped.
    """
    result = []
    with open(input_data) as source:
        for fields in _iter_subscriber_records(source):
            second_fields = set()
            for field in fields:
                matched = _OPTGPRS_FIELD.match(field)
                if matched:
                    second_fields.add(matched.group(1))
            if "0" not in second_fields or "33" not in second_fields:
                continue

            imsi = _first_full_match(_IMSI_FIELD, fields)
            msisdn = _first_full_match(_MSISDN_FIELD, fields)
            if imsi and msisdn:
                result.append(";".join((imsi, msisdn)))
    return result


if __name__ == '__main__':
    process('input_data.txt')
# 2. Code optimization: the same logic implemented with a class
import re


class apnInfoFinder():
    """Find subscribers that have OPTGPRS entries with both 0 and 33.

    A subscriber matches when its record (the lines between ``<SUBBEGIN`` and
    ``<SUBEND``) contains one ``OPTGPRS=<index>-0-...`` field and one
    ``OPTGPRS=<index>-33-...`` field. For each match the string
    ``'IMSI=<digits>;MSISDN=<digits>'`` is collected in ``self.result``.

    Typical use is ``apnInfoFinder(path).outPut()``; the three ``get*``
    methods are the individual pipeline steps that ``outPut`` runs in order.
    """

    # Lines that open and close one subscriber record in the input file.
    _SUB_BEGIN = "<SUBBEGIN"
    _SUB_END = "<SUBEND"

    # group(1) is the second dash-separated field; the trailing (?:-|$)
    # keeps "33" from also matching "330" and "0" from matching "05".
    _OPTGPRS_FIELD = re.compile(r"OPTGPRS=\d+-(\d+)(?:-|$)")
    _IMSI_FIELD = re.compile(r"IMSI=\d+")
    _MSISDN_FIELD = re.compile(r"MSISDN=\d+")

    def __init__(self, input_data):
        """Remember the input file path and create the empty work lists."""
        self.file = input_data
        self.msglist = []      # stripped lines of the input file
        self.start_index = []  # indexes in msglist of "<SUBBEGIN" lines
        self.stop_index = []   # indexes in msglist of "<SUBEND" lines
        self.result = []       # 'IMSI=...;MSISDN=...' of matching subscribers

    # Step 1: read the file into a list of whitespace-stripped lines
    def getMsgList(self):
        """Load ``self.file`` into ``self.msglist`` and return that list.

        Each line is stripped of surrounding whitespace (including the
        newline) so the record markers can be compared by equality. The list
        is refreshed in place, so calling this twice does not duplicate lines.
        """
        with open(self.file) as f:
            self.msglist[:] = [line.strip() for line in f]
        return self.msglist

    # Step 2: locate the begin/end marker of every record
    def getNewList(self, list):
        """Record where each subscriber record starts and stops in *list*.

        :param list: stripped lines, as returned by ``getMsgList``
        :return: tuple ``(self.start_index, self.stop_index)``
        """
        lines = list  # local alias; the parameter name shadows the builtin
        self.start_index[:] = [
            pos for pos, line in enumerate(lines) if line == self._SUB_BEGIN
        ]
        self.stop_index[:] = [
            pos for pos, line in enumerate(lines) if line == self._SUB_END
        ]
        return self.start_index, self.stop_index

    @staticmethod
    def _split_fields(lines):
        """Split record lines into non-empty ``KEY=VALUE`` fields."""
        fields = []
        for line in lines:
            for part in line.split(";"):
                part = part.strip()
                if part:
                    fields.append(part)
        return fields

    @staticmethod
    def _first_full_match(pattern, fields):
        """Return the first field that matches *pattern* entirely, else None."""
        for field in fields:
            if pattern.fullmatch(field):
                return field
        return None

    # Step 3: rebuild each record and look for the wanted OPTGPRS entries
    def getFinder(self, lenlist, list):
        """Collect IMSI/MSISDN of the records that have both 0 and 33.

        :param lenlist: only its length is used; it limits how many
            ``start_index``/``stop_index`` pairs are examined
        :param list: stripped lines that the stored indexes refer to
        :return: ``self.result``, refreshed in place with one
            ``'IMSI=...;MSISDN=...'`` string per matching record. Records
            without an IMSI or MSISDN field are skipped.
        """
        lines = list  # local alias; the parameter name shadows the builtin
        # Never index past the shorter list when markers are unbalanced.
        count = min(len(lenlist), len(self.start_index), len(self.stop_index))
        found = []
        for start, stop in zip(self.start_index[:count], self.stop_index[:count]):
            fields = self._split_fields(lines[start + 1:stop])

            second_fields = set()
            for field in fields:
                matched = self._OPTGPRS_FIELD.match(field)
                if matched:
                    second_fields.add(matched.group(1))
            if "0" not in second_fields or "33" not in second_fields:
                continue

            imsi = self._first_full_match(self._IMSI_FIELD, fields)
            msisdn = self._first_full_match(self._MSISDN_FIELD, fields)
            if imsi and msisdn:
                found.append(";".join((imsi, msisdn)))
        self.result[:] = found
        return self.result

    # Step 4: run the whole pipeline and write the result file
    def outPut(self):
        """Run steps 1-3 and write the matches to ``output.csv``.

        The file is created in the current working directory and holds one
        ``'IMSI=...;MSISDN=...'`` entry per line.
        """
        self.getMsgList()
        self.getNewList(self.msglist)
        self.getFinder(self.start_index, self.msglist)
        with open('output.csv', 'w') as out_result:
            out_result.writelines(line + '\n' for line in self.result)


if __name__ == '__main__':
    a = apnInfoFinder('input_data.txt')
    a.outPut()