• 数据去重优化 MemoryError 内存不足


    from ProjectUtil.usingModuleTOMODIFY import getNow
    
    export_q_f, q_l, start_ = '/mnt/mongoexport/superpub-ask-question.csv', [], getNow()  # input file path, list that accumulates questions, getNow() value at start-up (printed as 'start_' in the progress log)
    
    
    def save_(q_l):  # write the questions in q_l to a snapshot file, one per line
        export_q_f = '/mnt/mongoexport/superpub-ask-question-cleaned-NUM-{}.txt'.format(len(q_l))  # local name; file name embeds the item count
        with open(export_q_f, 'w', encoding='utf-8') as fw:
            s = '\n'.join(q_l)  # separator is the escaped newline; builds the entire output as one string before writing
            fw.write(s)
    
    
    step = 500000  # de-duplicate and write a snapshot after every `step` input lines
    with open(export_q_f, 'r', encoding='utf-8') as fr:
        c = 0  # number of input lines read so far
        for i in fr:
            question = i.rstrip('\n')  # drop the trailing line break
            c += 1
            q_l.append(question)
            if c % step == 0:  # NOTE(review): lines read after the last multiple of `step` are never saved in this snippet
                q_l = [i for i in set(q_l)]  # collapse duplicates accumulated so far (order is not preserved)
                print(step, ':', c / step, 'start_', start_, 'now', getNow(), 'DistinctNum', len(q_l))
                save_(q_l)
    

      

    500000 : 1.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:05 DistinctNum 270513
    500000 : 2.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:07 DistinctNum 539468
    500000 : 3.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:09 DistinctNum 804547
    500000 : 4.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:10 DistinctNum 1073529
    500000 : 5.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:14 DistinctNum 1342413
    500000 : 6.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:17 DistinctNum 1616368
    500000 : 7.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:20 DistinctNum 1888643
    500000 : 8.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:24 DistinctNum 2159613
    500000 : 9.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:28 DistinctNum 2433085
    500000 : 10.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:32 DistinctNum 2705454
    500000 : 11.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:37 DistinctNum 2978046
    500000 : 12.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:43 DistinctNum 3244211
    500000 : 13.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:50 DistinctNum 3512526
    500000 : 14.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:56 DistinctNum 3782082
    500000 : 15.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:02 DistinctNum 4054694
    500000 : 16.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:09 DistinctNum 4325960
    500000 : 17.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:19 DistinctNum 4595687
    500000 : 18.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:26 DistinctNum 4870389
    500000 : 19.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:35 DistinctNum 5144203
    500000 : 20.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:46 DistinctNum 5416514
    500000 : 21.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:56 DistinctNum 5687541
    500000 : 22.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:08 DistinctNum 5959566
    500000 : 23.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:19 DistinctNum 6235717
    500000 : 24.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:31 DistinctNum 6508576
    500000 : 25.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:43 DistinctNum 6784810
    500000 : 26.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:57 DistinctNum 7057572
    500000 : 27.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:10 DistinctNum 7327870
    500000 : 28.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:26 DistinctNum 7600230
    500000 : 29.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:41 DistinctNum 7874540
    500000 : 30.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:58 DistinctNum 8148841
    500000 : 31.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:32:13 DistinctNum 8421791
    500000 : 32.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:32:33 DistinctNum 8695611
    500000 : 33.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:32:48 DistinctNum 8968033
    

      

    500000 : 103.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:19:54 DistinctNum 28080404
    500000 : 104.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:20:56 DistinctNum 28349367
    500000 : 105.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:22:03 DistinctNum 28618117
    500000 : 106.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:23:07 DistinctNum 28886698
    500000 : 107.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:24:11 DistinctNum 29157115
    Traceback (most recent call last):
      File "distinctMongoExportQuestion.py", line 23, in <module>
        save_(q_l)
      File "distinctMongoExportQuestion.py", line 10, in save_
        fw.write(s)
    MemoryError
    [root@e selfPlatformAskAnswerProjeect]# ll -ash /mnt/mongoexport/
    total 12G
     12K drwxr-xr-x 2 root root  12K Nov 29 10:24 .
    4.0K drwxr-xr-x 8 root root 4.0K Nov 26 10:01 ..
    1.4G -rw-r--r-- 1 root root 1.4G Nov 29 10:16 superpub-ask-question-cleaned-NUM-27004655.txt
    1.4G -rw-r--r-- 1 root root 1.4G Nov 29 10:17 superpub-ask-question-cleaned-NUM-27272026.txt
    1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:18 superpub-ask-question-cleaned-NUM-27537864.txt
    1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:19 superpub-ask-question-cleaned-NUM-27809291.txt
    1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:20 superpub-ask-question-cleaned-NUM-28080404.txt
    1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:21 superpub-ask-question-cleaned-NUM-28349367.txt
    1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:22 superpub-ask-question-cleaned-NUM-28618117.txt
    1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:23 superpub-ask-question-cleaned-NUM-28886698.txt
       0 -rw-r--r-- 1 root root    0 Nov 29 10:24 superpub-ask-question-cleaned-NUM-29157115.txt
    [root@e selfPlatformAskAnswerProjeect]#
    

      

    修改代码

    from ProjectUtil.usingModuleTOMODIFY import getNow
    
    export_q_f, q_l, start_ = '/data/bigdata/mongoexport/superpub-ask-question.csv', [], getNow()  # input file path, list that accumulates questions, getNow() value at start-up

    step = 500000  # de-duplicate and write a snapshot after every `step` input lines
    with open(export_q_f, 'r', encoding='utf-8') as fr:
        c = 0  # number of input lines read so far
        for i in fr:
            question = i.rstrip('\n')  # drop the trailing line break
            c += 1
            q_l.append(question)
            if c % step == 0:  # NOTE(review): lines read after the last multiple of `step` are never saved in this snippet
                q_l = [i for i in set(q_l)]  # collapse duplicates accumulated so far (order is not preserved)
                print(step, ':', c / step, 'start_', start_, 'now', getNow(), 'DistinctNum', len(q_l))
                export_q_f = '/data/bigdata/mongoexport/superpub-ask-question-cleaned-NUM-{}.txt'.format(len(q_l))  # rebinds export_q_f to the snapshot path (input file is already open)
                with open(export_q_f, 'w', encoding='utf-8') as fw:
                    s = '\n'.join(q_l)  # separator is the escaped newline; builds the entire output as one string before writing
                    fw.write(s)
    

      更换主机为16G--->32G(开启进程前,内存消耗约5G)

    6核--->同规格8核(之前cpu消耗情况未统计)

     cat /proc/cpuinfo

    processor : 7
    vendor_id : GenuineIntel
    cpu family : 6
    model : 63
    model name : Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
    stepping : 2
    microcode : 0x1
    cpu MHz : 2494.224
    cache size : 30720 KB
    physical id : 0
    siblings : 8
    core id : 3
    cpu cores : 4
    apicid : 7
    initial apicid : 7
    fpu : yes
    fpu_exception : yes
    cpuid level : 13
    wp : yes
    flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm xsaveopt fsgsbase bmi1 avx2 smep bmi2 erms invpcid
    bogomips : 4988.44
    clflush size : 64
    cache_alignment : 64
    address sizes : 46 bits physical, 48 bits virtual
    power management:

    500000 : 96.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:40:37 DistinctNum 26197155
    500000 : 97.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:41:19 DistinctNum 26466813
    500000 : 98.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:42:03 DistinctNum 26737397
    500000 : 99.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:42:45 DistinctNum 27005103
    500000 : 100.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:43:28 DistinctNum 27272487
    500000 : 101.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:44:10 DistinctNum 27538331
    500000 : 102.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:44:55 DistinctNum 27809771
    500000 : 103.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:45:38 DistinctNum 28080901
    500000 : 104.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:46:24 DistinctNum 28349871
    500000 : 105.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:47:10 DistinctNum 28618630
    500000 : 106.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:47:56 DistinctNum 28887233
    500000 : 107.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:48:43 DistinctNum 29157679
    500000 : 108.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:49:33 DistinctNum 29420209
    500000 : 109.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:50:21 DistinctNum 29675048
    500000 : 110.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:51:10 DistinctNum 29934499
    500000 : 111.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:51:59 DistinctNum 30193756
    500000 : 112.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:52:50 DistinctNum 30453618
    500000 : 113.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:53:40 DistinctNum 30712426
    500000 : 114.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:54:31 DistinctNum 30972908
    500000 : 115.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:55:25 DistinctNum 31234766
    500000 : 116.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:56:18 DistinctNum 31495613
    500000 : 117.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:57:13 DistinctNum 31756776
    

      

    500000 : 152.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:35:05 DistinctNum 40981071
    500000 : 153.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:36:20 DistinctNum 41243684
    500000 : 154.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:37:40 DistinctNum 41511378
    500000 : 155.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:38:57 DistinctNum 41777831
    500000 : 156.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:40:16 DistinctNum 42043333
    500000 : 157.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:41:33 DistinctNum 42308552
    500000 : 158.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:42:49 DistinctNum 42568225
    500000 : 159.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:44:06 DistinctNum 42818269
    500000 : 160.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:45:24 DistinctNum 43069718
    500000 : 161.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:46:42 DistinctNum 43322396
    500000 : 162.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:48:06 DistinctNum 43573573
    500000 : 163.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:49:23 DistinctNum 43826414
    500000 : 164.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:50:42 DistinctNum 44079373
    500000 : 165.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:52:01 DistinctNum 44335042
    500000 : 166.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:53:22 DistinctNum 44593450
    500000 : 167.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:54:46 DistinctNum 44854064
    500000 : 168.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:56:11 DistinctNum 45115737
    500000 : 169.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:57:36 DistinctNum 45378583
    500000 : 170.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:58:58 DistinctNum 45638980
    500000 : 171.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:00:23 DistinctNum 45902750
    500000 : 172.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:01:46 DistinctNum 46163054
    500000 : 173.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:03:12 DistinctNum 46212601
    500000 : 174.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:04:37 DistinctNum 46240277
    500000 : 175.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:06:02 DistinctNum 46269660
    500000 : 176.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:07:28 DistinctNum 46317443
    500000 : 177.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:08:54 DistinctNum 46492828
    Traceback (most recent call last):
      File "distinctMongoExportQuestion.py", line 17, in <module>
        s = '\n'.join(q_l)
    MemoryError
    

      

    磁盘监控程序

    import os, time
    
    # Periodic clean-up: every 120 seconds, run a shell pipeline that force-removes
    # the paths under the current directory that `find -mmin +3` lists (modified
    # more than 3 minutes ago) and whose path contains "txt".
    # NOTE(review): nothing here checks whether the producer is still writing new
    # files; once it stops, every matching file ages past the threshold and is removed.
    cleanup_cmd = 'find ./* -mmin +3 | grep txt | xargs rm -f'
    while True:
        print(cleanup_cmd)  # echo the command before each run
        os.system(cleanup_cmd)
        time.sleep(120)  # wait two minutes between runs
    

      

    但是没有对监控程序本身的监控,导致数据都被删除了……

  • 相关阅读:
    c语言I博客作业02
    第六次作业
    第五次作业
    第四次作业
    第三次作业
    第二周作业
    第一周作业
    《面向学科竞赛的实验室信息管理体系构建》文献阅读随笔
    《高校学科竞赛管理系统研发与应用》文献阅读随笔
    《网络竞赛系统框架设计与功能模块实现》文献阅读随笔
  • 原文地址:https://www.cnblogs.com/rsapaper/p/10036357.html
Copyright © 2020-2023  润新知