从groupby 理解mapper-reducer

注，reduce之前已经shuff。

http://zheming.wang/blog/2015/05/19/3AFF5BE8-593C-4F76-A72A-6A40FB140D4D/

简单地说，reduce任务在执行之前的工作就是不断地拉取每个map任务的最终结果，然后对从不同地方拉取过来的数据不断地做merge，也最终形成一个文件作为reduce任务的输入文件。下图reducer部分可能会有误导...

mapper.py

#!/usr/bin/env python
"""mapper.py"""

import sys

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s	%s' % (word, 1)

reducer.py

#!/usr/bin/env python
"""reducer.py"""

from operator import itemgetter
import sys

current_word = None
current_count = 0
word = None

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the input we got from mapper.py
    word, count = line.split('	', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line
        continue

    # this IF-switch only works because Hadoop sorts map output
    # by key (here: word) before it is passed to the reducer
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # write result to STDOUT
            print '%s	%s' % (current_word, current_count)
        current_count = count
        current_word = word

# do not forget to output the last word if needed!
if current_word == word:
    print '%s	%s' % (current_word, current_count)

Improved Mapper and Reducer code: using Python iterators and generators

mapper.py

#!/usr/bin/env python
"""A more advanced Mapper, using Python iterators and generators."""

import sys

def read_input(file):
    for line in file:
        # split the line into words
        yield line.split()

def main(separator='	'):
    # input comes from STDIN (standard input)
    data = read_input(sys.stdin)
    for words in data:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        for word in words:
            print '%s%s%d' % (word, separator, 1)

if __name__ == "__main__":
    main()

reducer.py

#!/usr/bin/env python
"""A more advanced Reducer, using Python iterators and generators."""

from itertools import groupby
from operator import itemgetter
import sys

def read_mapper_output(file, separator='	'):
    for line in file:
        yield line.rstrip().split(separator, 1)

def main(separator='	'):
    # input comes from STDIN (standard input)
    data = read_mapper_output(sys.stdin, separator=separator)
    # groupby groups multiple word-count pairs by word,
    # and creates an iterator that returns consecutive keys and their group:
    #   current_word - string containing a word (the key)
    #   group - iterator yielding all ["&lt;current_word&gt;", "&lt;count&gt;"] items
    for current_word, group in groupby(data, itemgetter(0)):
        try:
            total_count = sum(int(count) for current_word, count in group)
            print "%s%s%d" % (current_word, separator, total_count)
        except ValueError:
            # count was not a number, so silently discard this item
            pass

if __name__ == "__main__":
    main()

相关阅读:
ParksLink修改密码
 ORA-01940:无法删除当前已链接的用户
 imp导入数据的时候报错：ORA-01658: 无法为表空间 MAXDATA 中的段创建 INITIAL 区
 Linux下查看日志用到的常用命令
 大批量数据高效插入数据库表
 线程中断：Thread类中interrupt（）、interrupted（）和 isInterrupted（）方法详解
 CyclicBarrier、CountDownLatch、Callable、FutureTask、thread.join() 、wait()、notify()、Condition
Mysql全文索引
 Docker 镜像的常用操作
 Docker 入门
原文地址：https://www.cnblogs.com/TMatrix52/p/11619614.html

最新文章
swift
iOS 开发
 swift
swift
前端
 Nginx SSL配置
 Nginx 负载均衡
 Nginx 代理
 /var/log/spooler
Linux nmon 命令