#!/usr/bin/env python
from operator import itemgetter
import sys
def reduce_counts(lines):
    """Sum the counts of consecutive identical words in tab-separated input.

    Each element of *lines* is expected to look like ``word<TAB>count``.
    Only adjacent equal words are merged, so the input must already be
    grouped by word (for example by piping it through ``sort``).

    Lines without a tab separator, or whose count is not an integer, are
    skipped and do not disturb the running total of the current word.

    Yields:
        ``(word, total)`` tuples, one per run of identical words, in the
        order the runs appear in the input.
    """
    current_word = None
    current_count = 0
    for line in lines:
        # Everything before the first tab is the word; the remainder must be
        # the integer count. partition() never raises, unlike unpacking the
        # result of split().
        word, sep, count = line.strip().partition('\t')
        if not sep:
            continue
        try:
            count = int(count)
        except ValueError:
            continue
        if word == current_word:
            current_count += count
            continue
        # A new word starts: emit the finished run (if any) and reset.
        if current_word is not None:
            yield current_word, current_count
        current_word = word
        current_count = count
    # Flush the last run. Testing against None (rather than comparing with the
    # last word read) avoids emitting a bogus record on empty input and avoids
    # losing the final run when the last input line was malformed.
    if current_word is not None:
        yield current_word, current_count


def main():
    """Read ``word<TAB>count`` lines from stdin and print per-word totals."""
    for word, total in reduce_counts(sys.stdin):
        # sys.stdout.write behaves the same on Python 2 and Python 3.
        sys.stdout.write('%s\t%s\n' % (word, total))


if __name__ == '__main__':
    main()
建立了 mapper.py 和 reducer.py 这两个脚本之后,先在本地测试一下:
[qiu.li@l-tdata5.tkt.cn6 /export/python]$ echo "I like python hadoop , hadoop very good" | ./mapper.py | sort -k 1,1 | ./reducer.py
,	1
good	1
hadoop	2
I	1
like	1
python	1
very	1
[qiu.li@l-tdata5.tkt.cn6 /export/python]$ hadoop dfs -cat /user/ticketdev/tmp/output/part-00000 | sort -nk 2 | tail
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.