-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapper.py
More file actions
52 lines (38 loc) · 1.35 KB
/
mapper.py
File metadata and controls
52 lines (38 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
"""An advanced Mapper, using Python iterators and generators."""
import sys
import re
def read_input(input):
    """Yield the unigrams, bigrams and trigrams found on each line of *input*.

    Each line is lower-cased, every character outside ``[a-z0-9]`` is turned
    into a space, and the result is split on whitespace into tokens.  Lines
    that produce fewer than three tokens are skipped entirely: nothing is
    yielded for them, not even their single words.

    Args:
        input: An iterable of text lines (for example ``sys.stdin``).  The
            name shadows the builtin, but it is kept so that existing
            keyword callers keep working.

    Yields:
        One list per accepted line, containing the tokens in order, then
        every adjacent pair joined with ``,``, then every adjacent triple
        joined with ``,``.
    """
    # Compile once up front instead of handing the pattern to re.sub per line.
    non_alnum = re.compile(r"[^a-z0-9]")
    for line in input:
        tokens = non_alnum.sub(" ", line.lower()).split()
        if len(tokens) < 3:
            continue
        # zip stops at the shortest argument, so this yields exactly the
        # len-1 adjacent pairs and len-2 adjacent triples.
        bigrams = [",".join(pair) for pair in zip(tokens, tokens[1:])]
        trigrams = [
            ",".join(triple)
            for triple in zip(tokens, tokens[1:], tokens[2:])
        ]
        yield tokens + bigrams + trigrams
def main(separator='\t'):
    """Write one ``<token><separator>1`` record per token read from stdin.

    Lines are read from standard input through ``read_input``; every token
    it yields for a line is printed to standard output with a count of 1.
    According to the original author's notes, this output is meant to be the
    input of the Reduce step (``reducer.py``).

    Args:
        separator: String placed between the token and its count in each
            output record.  Defaults to a tab.
    """
    # Iterate the generator directly: records are emitted as lines arrive
    # and the whole input never has to be held in memory at once.
    for words in read_input(sys.stdin):
        for word in words:
            # Delimited record; the trivial count for each occurrence is 1.
            print('%s%s%d' % (word, separator, 1))
# How to test locally in bash/Linux: cat <input> | python mapper.py
# Run the mapper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()