-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreducer2.py
More file actions
61 lines (49 loc) · 2.52 KB
/
reducer2.py
File metadata and controls
61 lines (49 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
"""An advanced Reducer, using Python iterators and generators."""
from itertools import groupby
from operator import itemgetter
import sys
import re
import json
# receive the output of a mapper, (key, [value, value, ...])
def read_mapper_output(input, separator='\t'):
    """Lazily turn mapper output lines into ``[key, value]`` lists.

    Each line taken from ``input`` has its trailing whitespace removed and
    is then split on the first occurrence of ``separator`` only, so the
    value part may itself contain the separator. A line that does not
    contain the separator yields a one-element list.
    """
    for raw_line in input:
        trimmed = raw_line.rstrip()
        # maxsplit=1: everything after the first separator belongs to the value
        fields = trimmed.split(separator, 1)
        yield fields
def _parse_ngram(raw_value):
    """Decode one mapper value into ``(comma_count, text, count)``.

    ``raw_value`` is expected to be a JSON array whose first element is a
    comma-joined n-gram string and whose second element is its count.
    Returns ``None`` when the value is not valid JSON, does not have that
    shape, or carries a count that ``int()`` rejects.
    """
    try:
        entry = json.loads(raw_value)
        ngram, count = entry[0], entry[1]
        int(count)  # validation only: the count is printed exactly as received
        return ngram.count(','), ngram.replace(',', ' '), count
    except (ValueError, TypeError, IndexError, KeyError, AttributeError):
        return None


def _print_table(unigrams, bigrams, trigrams, separator):
    """Print the three columns side by side, one separator-joined row each."""
    filler = ('----', '0')
    height = max(len(unigrams), len(bigrams), len(trigrams))
    # Pad each column only by what it is missing so all three end up equal.
    columns = [
        column + [filler] * (height - len(column))
        for column in (unigrams, bigrams, trigrams)
    ]
    for uni, bi, tri in zip(*columns):
        print(separator.join('%s' % cell for cell in uni + bi + tri))


def main(separator='\t'):
    """Read mapper output from stdin and print an n-gram table to stdout.

    Records are read through ``read_mapper_output`` and grouped by key with
    ``itertools.groupby`` (consecutive equal keys form one group). Each
    value is parsed as a JSON ``[ngram, count]`` pair and sorted into a
    column by the number of commas in the n-gram: two commas -> trigram,
    one comma -> bigram, anything else -> unigram. After a header line,
    every group is printed as rows of
    ``unigram, count, bigram, count, trigram, count`` joined by
    ``separator``, with commas in the n-gram shown as spaces and short
    columns padded with ``----`` / ``0``.

    Malformed records (no separator, invalid JSON, non-numeric count) are
    skipped individually instead of suppressing the whole group; the number
    skipped is reported once on stderr.

    Args:
        separator: Field delimiter used both to split input lines and to
            join output columns.
    """
    # input comes from STDIN (standard input)
    data = read_mapper_output(sys.stdin, separator=separator)
    print('unigram bigram trigram')
    skipped = 0
    # The mapper is said to emit a single hard-coded key, so normally there
    # is exactly one group; columns are still reset per key so that several
    # keys cannot leak rows or padding into each other's tables.
    for _key, group in groupby(data, itemgetter(0)):
        unigrams, bigrams, trigrams = [], [], []
        for record in group:
            parsed = _parse_ngram(record[1]) if len(record) == 2 else None
            if parsed is None:
                skipped += 1
                continue
            commas, text, count = parsed
            if commas == 2:
                trigrams.append((text, count))
            elif commas == 1:
                bigrams.append((text, count))
            else:
                # Everything that is not a bigram or trigram lands here.
                unigrams.append((text, count))
        _print_table(unigrams, bigrams, trigrams, separator)
    if skipped:
        sys.stderr.write(
            'reducer: discarded %d malformed record(s)\n' % skipped
        )
# Run the reducer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()