代码如下:
import random import datetime import time dataCount = 10*100*100 #10M. codeRange = range(ord('a'),ord('z')) alphaRange = [chr(x) for x in codeRange] alphaMax = len(alphaRange) daysMax = 42003 theDay = datetime.date(1900,1,1) def genRandomName(nameLength): global alphaRange,alphaMax length = random.randint(1, nameLength) name = '' for i in range(length): name += alphaRange[random.randint(0,alphaMax-1)] return name def genRandomDay(): global daysMax,theDay mDays = random.randint(0,daysMax) mDate = theDay + datetime.timedelta(days=mDays) return mDate.isoformat() def genRandomSex(): return random.randint(0,1) def genDataBase1(fileName,dataCount): outp = open(fileName,'w') i = 0 while i<dataCount: firstName = genRandomName(14) lastName = genRandomName(14) birthday = genRandomDay() sex = genRandomSex() mLine = "%i %s %s %s %d "%(i+1,firstName,lastName,birthday,sex) outp.write(mLine) i += 1 outp.close() if __name__ == "__main__": random.seed() start = time.time() genDataBase1('db_test.txt',dataCount) end = time.time() print('use time:%d'%(end-start)) print('Ok')
生成数据格式:
1 wkhmjprutxovs bhlt 1999-07-16 0
2 q mwvvjmpdlmk 1984-11-01 0
3 jqpaxktiudjta rrxxiba 1903-05-23 0
4 moqedxba v 1951-07-11 0
5 gjalleufxt rsdoneumcgbmo 1900-08-30 0
6 hrtfx d 1948-09-11 0
7 iomxbjrywau aure 1993-11-16 0
8 rxhqatkq fvcsqhpogmenud 1979-01-01 1
9 xrqcwhvh ucbd 1976-06-14 1
10 sgurlwao au 1989-04-30 1
11 g vfb 1992-10-07 0
12 yyfatwh ibwfdfdqnpbeau 1955-09-18 1
13 xubjawbdkgx rjaocwemvvgj 1905-10-14 0
14 bdwkgvkkuok bgjfffekqy 1931-03-12 1
15 ckv itqdy 1963-11-11 0
16 auwwabbc luipbejel 1984-05-06 0
17 mefykukxwodhm iiilfjxjpqq 2005-04-23 1
总结:
一千万条数据 生产时间大概需要490秒。占用内存320m左右 机器配置 8g内存 i3处理器
用grep 查询其中一条数据大概需要1到2秒 统计五千条数据需要 1到2秒时间,说明 grep功能强大
一亿条数据
用grep 查询其中一条数据大概需要40秒 统计五千条数据需要 40秒时间,时间指数增加,生产数据时间是4876秒(可优化)
用sed把一个字符串替换为另外一个字符串 需要4分钟 替换的数据了为649747 十万级别的