所有操作,假定hadoop集群已经正常部署。
Python源码
mapper.py
#!/usr/bin python
import sys
# input comes from STDIN (standard input)
for line in sys.stdin:
line = line.strip()
words = line.split()
for word in words:
print '%s\\t%s' % (word, 1)
reduce.py
#!/usr/bin python
from operator import itemgetter
import sys
word2count = {}
# input comes from STDIN
for line in sys.stdin:
line = line.strip()
word, count = line.split('\\t', 1)
try:
count = int(count)
word2count[word] = word2count.get(word, 0) + count
except ValueError:
# count was not a number, so silently
# ignore/discard this line
pass
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))
for word, count in sorted_word2count:
print '%s\\t%s'% (word, count)