#!/usr/bin/env python
import sys
# Streaming mapper: for every whitespace-separated word read from stdin,
# emit "<word>\t1" on stdout. No totals are computed here; every occurrence
# of a word simply produces its own line with a count of 1.
for line in sys.stdin:
    # split() with no arguments already discards leading/trailing whitespace,
    # so a separate strip() call is unnecessary.
    for word in line.split():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s\t%s" % (word, 1))
该脚本从 STDIN 读取输入,把每一行切分成单词,并把每个单词及其计数 1 输出到 STDOUT。Map 脚本不会计算单词出现的总次数,而是对每个单词直接输出 1;在我们的例子中,统计工作交给随后的 Reduce 阶段完成。
为了使脚本可执行,需要给 mapper.py 增加可执行权限(例如:chmod +x mapper.py)。
#!/usr/bin/env python
import sys
def read_input(file):
    """Lazily yield the list of words found on each line of *file*.

    Args:
        file: Any iterable of text lines (for example ``sys.stdin``).

    Yields:
        One list of whitespace-separated words per input line; a blank
        line yields an empty list.
    """
    for line in file:
        yield line.split()
def main(separator='\t'):
    """Write ``<word><separator>1`` to stdout for every word read from stdin.

    Args:
        separator: String placed between the word and its count of 1.
    """
    for words in read_input(sys.stdin):
        for word in words:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s%s%d" % (word, separator, 1))
if __name__ == "__main__":
    # Run the mapper only when executed as a script, not when imported.
    main()

# reducer.py
#!/usr/bin/env python
from operator import itemgetter
from itertools import groupby
import sys
def read_mapper_output(file, separator = '\t'):
    """Lazily split each line of *file* into at most two fields.

    Args:
        file: Any iterable of text lines (for example ``sys.stdin``).
        separator: String on which each line is split.

    Yields:
        A list produced by splitting the right-stripped line once on
        *separator*: two items when the separator is present, otherwise
        a single-item list.
    """
    for line in file:
        # maxsplit=1 keeps any further separators inside the second field.
        yield line.rstrip().split(separator, 1)
def main(separator = '\t'):
    """Sum the counts of each run of identical words read from stdin.

    Each input line is expected to look like ``<word><separator><count>``.
    For every run of adjacent lines sharing the same word, one line
    ``<word><separator><total>`` is written to stdout.

    Args:
        separator: String between word and count, used for both parsing
            the input and formatting the output.
    """
    data = read_mapper_output(sys.stdin, separator = separator)
    # groupby only merges *adjacent* rows with equal keys, so the input is
    # expected to arrive already sorted by word.
    for current_word, group in groupby(data, itemgetter(0)):
        try:
            # Use a throwaway name for the key inside the generator so it
            # does not reuse the outer loop variable's name.
            total_count = sum(int(count) for _, count in group)
        except ValueError:
            # A row without a count, or with a non-integer count, makes the
            # whole group unusable; skip it deliberately (best effort).
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s%s%d" % (current_word, separator, total_count))
if __name__ == "__main__":
    # Run the reducer only when executed as a script, not when imported.
    main()
细节:groupby
from itertools import groupby
from operator import itemgetter
# Demonstration of itertools.groupby: rows that share the same first element
# (the date string) and sit next to each other are handed out as one group.
things = [
    ('2009-09-02', 11),
    ('2009-09-02', 3),
    ('2009-09-03', 10),
    ('2009-09-03', 4),
    ('2009-09-03', 22),
    ('2009-09-06', 33),
]

# groupby returns a one-shot iterator of (key, sub-iterator) pairs.
sss = groupby(things, itemgetter(0))
for key, items in sss:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(key)
    for subitem in items:
        print(subitem)
    # Visual divider after each group.
    print('-' * 20)
结果