• homework做了些什么?


    第一步:get_new_guid_uid_pairs_{$ymd}

    参数是时间100上的文件.

    那么100上的文件是从哪里来的呢?

    我们进入到100机器上,打开root权限下的cron,看到如下内容:

    ### add by kamilzhou for clickflow system
    8 * * * * /data/clickflow/hourly_data/hourlyGetLogDataStore.sh >> /data1/clickflow/hourly_data/get.log &
    20 * * * * /data/clickflow/filters/correct-page-id-filter/clickflowlog-filter/bin/hourly_refer_filter.sh

    所以我猜测是不是100上的数据是从这两个脚本拉过来的呢?

    查看这俩个脚本内容.

     1 #!/bin/sh
     2 
     3 #获取当前脚本所在路径
     4 currdir=$(cd "$(dirname "$0")"; pwd)
     5 
     6 #进入脚本所在目录
     7 cd $currdir
     8 
     9 log_file_time=`date +%Y%m%d%H -d"-1 hours"`
    10 log_file_dir=`date +%Y%m%d -d"-1 hours"`
    11 
    12 if  [ $# = 1 ]; then
    13     log_file_time=$1
    14 fi
    15 
    16 if  [ $# = 2 ]; then
    17     log_file_time=$1
    18     log_file_dir=$2
    19 fi
    20 
    21 if [ ! -d "/data1/clickflow/by-time/""$log_file_dir" ]; then
    22     mkdir "/data1/clickflow/by-time/""$log_file_dir"
    23 fi
    24 
    25 log_filename="row_data_""${log_file_time}""*"
    26 src_log_filename="/data/clickflow/data_today/""${log_filename}"
    27 
    28 /usr/bin/expect recv_file.exp 10.206.8.75 36000 $src_log_filename "/data1/clickflow/by-time/""$log_file_dir" kamilzhou Zhou2013@tencent
    hourlyGetLogDataStore.sh
     1 #!/bin/sh
     2 
     3 orig_log_dir="/data1/clickflow/by-time/"
     4 refer_filter_dir="/data1/clickflow/refer-filtered/"
     5 
     6 refer_filter_bin="/data/clickflow/filters/correct-page-id-filter/clickflowlog-filter/bin/correct-page-id-filter.sh"
     7 
     8 log_file_time=`date +%Y%m%d%H -d"-1 hours"`
     9 log_file_dir=`date +%Y%m%d -d"-1 hours"`
    10 
    11 if  [ $# = 1 ]; then
    12     log_file_time=$1
    13 fi
    14 
    15 if  [ $# = 2 ]; then
    16     log_file_time=$1
    17     log_file_dir=$2
    18 fi
    19 
    20 src_dir="$orig_log_dir""$log_file_dir""/"
    21 dest_dir="$refer_filter_dir""$log_file_dir""/"
    22 
    23 if [ ! -d "$dest_dir" ]; then
    24     mkdir "$dest_dir"
    25 fi
    26 
    27 src_files="$src_dir""row_data_""$log_file_time""*"
    28 
    29 for file in `ls $src_files`; do
    30     filename=`basename $file`;
    31 
    32     echo $filename
    33     $refer_filter_bin $file "$dest_dir""$filename"
    34 done
    hourly_refer_filter.sh

    hourlyGetLogDataStore.sh这个脚本每小时的8分钟执行,负责把前一个小时的75上的原始数据拉过来.

    hourly_refer_filter.sh 这个脚本每小时的20分钟执行,代码里调用了很多其他脚本.目的是负责过滤不合法的数据.

    详细了解请参考:

    100上的数据过滤

    参数是有了,那么输出是什么呢?

     我们来看下这个任务的handler.

      1 /**
      2  * get new guid uid pairs and paritition the raw clickflow data        获得新的 guid和uid 的配对   raw是未经加工的意思
      3  * useing multi-processes
      4  *
      5  * @param array $request request of a task
      6  * @param boolean &$continued wheater the process should be continued    
      7  *
      8  * @return boolean true when success, otherwise false
      9  * @author baronyuan
     10  */
     11 function getNewGuidUidPairsNPartitionRawDataSMP($in, $out, &$continued)
     12 {
     13     {  //参数检查
     14     if (!is_array($in)                      
     15         || !isset($in['date'])
     16         || !isset($in['rawdata'])
     17         || !is_array($out)
     18         || !isset($out['upairs'])
     19         || !isset($out['rawdata'])
     20     ) {
     21         Logger::err(sprintf(
     22             'invalid request with in[%s] or out[%s]',
     23             json_encode($in), json_encode($out)
     24         ));
     25 
     26         return false;
     27     }
     28     }
     29 
     30     $date = trim($in['date']);   //20140806
     31     $remote_dir = trim($in['rawdata']);   //    sftp://10.191.152.100:36000/data1/clickflow/refer-filtered/20140806/   未经处理的数据(来自100 )
     32     $result_pairs = $out['upairs'];     //      sftp://10.206.8.75:36000/data/clickflow/tmp/guid_uid-map/new/20140806/data //这个应该是临时存放文件夹,因为75上暂时看不见数据
     33     $results_rawdata = pattern_range2array($out['rawdata']);  //   sftp://10.206.8.75:36000/data/clickflow/tmp/rawdata/20140806/by-guid/data_0   ~   sftp://10.206.8.75:36000/data/clickflow/tmp/rawdata/20140806/by-guid/data_99
     34 
     35     $unikey = get_unikey();   //33772
     36 
     37     $resource_prefix =
     38         HOMEWORK_CLICKFLOW_TMP_PATH . 'tasks.' . $unikey . '_input_';       //         /data/tmp/clickflow/homework/tasks.unikey_input_  这个地址应该在75和85 上都有
     39     $output_prefix =
     40         HOMEWORK_CLICKFLOW_TMP_PATH . 'tasks.' . $unikey . '_output_';
     41 
     42     if (RemoteResourceAccessExists($remote_dir . 'row_data_' . $date . '000')) {   //正常情况下 ,这个条件是 成立的  row_data_20140806000
     43         $remote_resource_names = getSplitedRawDataNames($date);      //这个地方20140806000~20140806235
     44         $tasks_input = array();
     45 
     46         foreach ($remote_resource_names as $key => $name) {
     47             $resource_local = $resource_prefix . $key;               // HOMEWORK_CLICKFLOW_TMP_PATH . 'tasks.' . $unikey . '_input_0' 对应 row_data_20140806000
     48             $remote = $remote_dir . 'row_data_' . $name;            //sftp://10.191.152.100:36000/data1/clickflow/refer-filtered/20140806/row_data_20140806235
     49 
     50             if (!RemoteResourceAccessExists($remote)) {                  // 查看100上原始文件  是否存在
     51                 Logger::err(sprintf('%s not found', $remote));
     52                 return false;
     53             }
     54 
     55             $get_result = RemoteResourceAccessGetUntil(        //把100上的文件拿过来,放在本地
     56                 $remote_dir . 'row_data_' . $name,
     57                 $resource_local,                                //存放在本地的临时文件
     58                 $continued
     59             );
     60 
     61             if (!$continued) {
     62                 Logger::err('stop!');
     63                 return false;
     64             }
     65 
     66             FileAutoCleaner::add($resource_local);
     67             $tasks_input[] = $resource_local;                           //把拿过来的文件 存到一个数组里面
     68         }
     69     } else {
     70         $remote_resource_name = $remote_dir . 'row_data_' . $date;
     71 
     72         if (!RemoteResourceAccessExists($remote_resource_name)) {
     73             Logger::err(sprintf('%s not found', $remote_resource_name));
     74             return false;
     75         }
     76 
     77         $resource_local = $resource_prefix . 'inall';
     78         $get_result = RemoteResourceAccessGetUntil(
     79             $remote_resource_name,
     80             $resource_local,
     81             $continued
     82         );
     83 
     84         if (!$continued) {
     85             Logger::err('stop!');
     86             return false;
     87         }
     88 
     89         FileAutoCleaner::add($resource_local);
     90 
     91         if (false === split_file($resource_local, 144, $resource_prefix)) {
     92             Logger::err(sprintf(
     93                 'failed to split the file[%s]', $resource_prefix
     94             ));
     95 
     96             return false;
     97         }
     98 
     99         for ($i = 0; $i < 144; ++$i) {
    100             $tasks_input[] = $resource_prefix . $i;
    101         }
    102 
    103         FileAutoCleaner::add($tasks_input);
    104     }
    105 
    106     $tasks_output = array(                            //输出数组???、、。。。
    107         $output_prefix . 'pairs.output',
    108         pattern_range2array(
    109             $output_prefix . 'rawdata_[0-99]'
    110         )
    111     );
    112 
    113     file_put_contents($tasks_output[0], '');
    114 
    115     foreach ($tasks_output[1] as $task_rawdata_output) {
    116         file_put_contents($task_rawdata_output, '');
    117     }
    118 
    119     FileAutoCleaner::add($tasks_output[0], $tasks_output[1]);
    120 
    121     $num_of_processes = 40;
    122     $num_of_tasks = count($tasks_input);    // 从100上过来多少个文件  就有多少子任务
    123     $counter = -1;
    124 
    125     $task_divider = new TaskDivider();
    126 
    127     while ($num_of_tasks > 0) {
    128         if ($num_of_tasks < $num_of_processes) {
    129             $num_of_processes = $num_of_tasks;
    130         }
    131 
    132         $task_divider->clear();
    133 
    134         for ($i = 0; $i < $num_of_processes; ++ $i) {
    135             ++ $counter;
    136 
    137             $task_divider->add(
    138                 'getNewGuidUidPairsNPartitionRawDataTask',
    139                 array(
    140                     $tasks_input[$counter],
    141                     $tasks_output[0],
    142                     $tasks_output[1]
    143                 )
    144             );
    145         }
    146 
    147         $run_result = $task_divider->run($continued);
    148 
    149         if (false === $run_result) {
    150             Logger::err('fail to run tasks');
    151             return false;
    152         } elseif ($run_result !== 0) {
    153             Logger::err(sprintf('fail to run %d tasks', $run_result));
    154             return false;
    155         } else {
    156             // nothing to do ...
    157         }
    158 
    159         $num_of_tasks -= $num_of_processes;
    160     }
    161 
    162     $uniq_result = uniq_file($tasks_output[0]);
    163 
    164     if (false === $uniq_result) {
    165         Logger::err(sprintf('fail to uniq the file[%s]', $tasks_output[0]));
    166         return false;
    167     }
    168 
    169     $put_result = RemoteResourceAccessPutUntil(
    170         $tasks_output[0],
    171         $result_pairs,
    172         $continued
    173     );
    174 
    175     if (!$continued) {
    176         Logger::err('stop!');
    177         return false;
    178     }
    179 
    180     foreach ($tasks_output[1] as $key => $task_rawdata_output) {
    181         $local_path = $task_rawdata_output;
    182         $remote_path = $results_rawdata[$key];
    183 
    184         $put_result = RemoteResourceAccessPutUntil(
    185             $local_path,
    186             $remote_path,
    187             $continued
    188         );
    189 
    190         if (!$continued) {
    191             Logger::err('stop!');
    192             return false;
    193         }
    194     }
    195 
    196     return true;
    197 }
    getNewGuidUidPairsNPartitionRawDataSMP
      1 /**
      2  * get new guid uid pairs sub task for a single process to handle
      3  *
      4  * @param string $input_file file to handle
      5  * @param string $output_pairs_file output of pairs
      6  * @param array $output_rawdata_files output of rawdata partitioned
      7  *
      8  * @return boolean true when success, otherwise failed
      9  * @author baronyuan
     10  */
     11 function getNewGuidUidPairsNPartitionRawDataTask(
     12     $input_file,
     13     $output_pairs_file,
     14     $output_rawdata_files
     15 ) {
     16     global $logger;
     17     $logger->uninit();
     18     $logger = new Logger(HOMEWORK_WORKER_LOG_NAME);
     19 
     20     $fp_input = @fopen($input_file, 'r');     //这个地方打开的应该是 已经从100上拿来的 存在本地的临时文件
     21 
     22     if (false === $fp_input) {
     23         Logger::err(sprintf('failed to open file[%s]', $input_file));
     24         return false;
     25     }
     26 
     27     $upairs = array();
     28     $fps_rawdata_output = array();
     29 
     30     for ($i = 0; $i < 100; ++ $i) {
     31         $fp_rawdata_output =
     32             $fps_rawdata_output[] =                           //这个地方打开了100个文件,存在一个数组里,准备往里面写
     33                 @fopen($output_rawdata_files[$i], 'a');   // a 以附加的方式打开只写文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾,即文件原先的内容会被保留。
     34 
     35         if (false === $fp_rawdata_output) {
     36             Logger::err(
     37                 sprintf('fail to open file[%s]', $output_rawdata_files[$i])
     38             );
     39 
     40             return false;
     41         }
     42     }
     43 
     44     $special_key = -1;
     45 
     46     while (($line = fgets($fp_input, 10240)) !== false) {
     47         $cells = explode("	", $line);
     48 
     49         if (!isset($cells[LOG_FIELD_MAX_IDX])
     50             || $cells[LOG_FIELD_IDX_LOG_TYPE] != LOG_TYPE_PV
     51             || $cells[LOG_FIELD_IDX_PAGE_ID] < 0
     52         ) {
     53             continue;
     54         }
     55 
     56         // iframe
     57         $page_id = (int)$cells[LOG_FIELD_IDX_PAGE_ID];
     58         $wh_id = (int)$cells[LOG_FIELD_IDX_WH_ID];
     59 
     60         if ($wh_id != 1971) {
     61             if ((12376 == $page_id) || (1237017 == $page_id)
     62                 || (63786 == $page_id) || (6378017 == $page_id)
     63                 || (63796 == $page_id) || (6379017 == $page_id)
     64                 || (63806 == $page_id) || (6380017 == $page_id)
     65                 || (63816 == $page_id) || (6381017 == $page_id)
     66                 || (63826 == $page_id) || (6382017 == $page_id)
     67                 || (5020 == $page_id) || (11718320 == $page_id)
     68                 || (!(int)$cells[LOG_FIELD_IDX_PAGE_LEVEL] &&
     69                 !preg_match('#buy.(?:51buy|yixun).com#', $cells[LOG_FIELD_IDX_PAGE_URL]))
     70             ) {
     71                 continue;
     72             }
     73         }
     74 
     75         $uid = (int)$cells[LOG_FIELD_IDX_USER_ID];
     76         $guid = trim($cells[LOG_FIELD_IDX_GUID]);
     77 
     78         $hash_key = null;
     79 
     80         // tmp hack
     81         if (!in_array($wh_id, array(1, 1001, 2001, 3001, 4001, 5001, 1969, 1971))) {
     82             continue;
     83         }
     84 
     85         if (1970 == $wh_id) {
     86             $hash_key = $special_key = (++ $special_key) % 100;
     87         } else {
     88             $hash_key = crc32($guid) % 100;
     89 
     90             if ($uid && $guid && $guid != 'null' && $guid != 'Unknown') {
     91                 $upairs[$guid] = $uid;
     92             }
     93         }
     94 
     95         if ($hash_key < 0 || $hash_key > 99) {
     96             Logger::err(sprintf('invalid hash key %d', $hash_key));
     97         }
     98 
     99         $write_result = @fwrite($fps_rawdata_output[$hash_key], $line);
    100 
    101         if (false === $write_result) {
    102             Logger::err('failed to write a line into a file');
    103             return false;
    104         }
    105     }
    106 
    107     if (!feof($fp_input)) {
    108         Logger::err(sprintf('failed to open file[%s]', $input_file));
    109         return false;
    110     }
    111 
    112     fclose($fp_input);
    113 
    114     foreach ($fps_rawdata_output as $fp_rawdata_output) {
    115         @fclose($fp_rawdata_output);
    116     }
    117 
    118     $fp_pairs_output = @fopen($output_pairs_file, 'a');
    119 
    120     if (false === $fp_pairs_output) {
    121         return false;
    122     }
    123 
    124     foreach ($upairs as $guid => $uid) {
    125         $write_result = fwrite($fp_pairs_output, "{$guid}	{$uid}
    ");
    126 
    127         if (false === $write_result) {
    128             Logger:err(
    129                 sprintf(
    130                     'failed to write a line into file[%s]',
    131                     $output_pairs_file
    132                 )
    133             );
    134 
    135             return false;
    136         }
    137     }
    138 
    139     fclose($fp_pairs_output);
    140     return true;
    141 }
    getNewGuidUidPairsNPartitionRawDataTask
  • 相关阅读:
    引爆点--产品方法论
    智能的差旅预订 竞品分析
    差旅管理
    运营中的用户心理学
    鸡汤
    励志的鸡汤
    cmder安装
    jQuery height() innerHeight() outerHight() width() innerWidth() outerWidth()源码解读
    css未知宽度水平居中整理
    css水平垂直居中块整理
  • 原文地址:https://www.cnblogs.com/victorruan/p/3964337.html
Copyright © 2020-2023  润新知