1 import os
2 import subprocess
3 from . import hdmc
4 from hdmc import hdfs
5 from . hdmc.code_generator import CodeGenerator
6
# Local scratch directory; search() stages its path-tagged copy of each input
# file here (as "ziggy_search_tmp") before uploading it to HDFS.
tmp_directory = "/tmp"
# Names exported by `from <module> import *`.
__all__ = ["sort_numeric", "sort_ascii", "search", "histogram"]
9
10
# NOTE(review): the `def` line that encloses these statements is not visible
# in this view; only the body is reproduced here.
#
# Assembles a streaming script that reads stdin and, for every non-empty
# stripped line, prints 'ValueHistogram:<line>:1'.  The script is saved as
# histogram_frame.py and made readable/writable/executable for everyone.
gen = CodeGenerator()
gen.begin()

# Layout of the generated script: a string is emitted with gen.write(),
# +1 calls gen.indent() and -1 calls gen.dedent(), in exactly this order.
script_plan = (
    "#! /usr/bin/env python\n\n",
    "import sys\n\n",
    # read_input(): yield each stripped line of the given file object.
    "def read_input(file):\n",
    +1,
    "for line in file:\n",
    +1,
    "yield line.strip()\n",
    -1,
    -1,
    "\n\n",
    # main(): emit one 'ValueHistogram' record per non-empty line.
    "def main():\n",
    +1,
    "data = read_input(sys.stdin)\n",
    "for line in data:\n",
    +1,
    "if len(line)>0:\n",
    +1,
    "print 'ValueHistogram:'+line+':'+'1'\n",
    -1,
    -1,
    -1,
    "\n\n",
    'if __name__ == \"__main__\":\n',
    +1,
    "main()\n",
    "\n",
)
for step in script_plan:
    if step == +1:
        gen.indent()
    elif step == -1:
        gen.dedent()
    else:
        gen.write(step)

# Persist the generated source (followed by a newline) and open up its mode.
script_out = open("histogram_frame.py", "w")
script_out.write("%s\n" % (gen.end(),))
script_out.close()
os.system("chmod a+rwx histogram_frame.py")
44
# NOTE(review): the `def` line that encloses these statements is not visible
# in this view; only the body is reproduced here.
#
# Assembles a streaming script that tallies identical non-empty stdin lines
# in a defaultdict(int) and then prints 'ValueHistogram:<line>:<count>' for
# each distinct line.  The script is saved as histogram_frame.py and made
# readable/writable/executable for everyone.
gen = CodeGenerator()
gen.begin()

# Layout of the generated script: a string is emitted with gen.write(),
# +1 calls gen.indent() and -1 calls gen.dedent(), in exactly this order.
script_plan = (
    "#! /usr/bin/env python\n\n",
    "from collections import defaultdict\n",
    "import sys\n\n",
    # read_input(): yield each stripped line of the given file object.
    "def read_input(file):\n",
    +1,
    "for line in file:\n",
    +1,
    "yield line.strip()\n",
    -1,
    -1,
    "\n\n",
    # main(): count occurrences first, then emit one record per key.
    "def main():\n",
    +1,
    "data = read_input(sys.stdin)\n",
    "c = defaultdict(int)\n",
    "for line in data:\n",
    +1,
    "if len(line)>0:\n",
    +1,
    "c[line] += 1\n",
    -1,
    -1,
    "for key in c.keys():\n",
    +1,
    "print 'ValueHistogram:'+key+':'+str(c[key])\n",
    -1,
    -1,
    "\n\n",
    'if __name__ == \"__main__\":\n',
    +1,
    "main()\n",
    "\n",
)
for step in script_plan:
    if step == +1:
        gen.indent()
    elif step == -1:
        gen.dedent()
    else:
        gen.write(step)

# Persist the generated source (followed by a newline) and open up its mode.
script_out = open("histogram_frame.py", "w")
script_out.write("%s\n" % (gen.end(),))
script_out.close()
os.system("chmod a+rwx histogram_frame.py")
84
# NOTE(review): the `def` line that encloses these statements is not visible
# in this view; it presumably belongs to make_search_frame(), which search()
# calls with `suppress_lines`.  Only the body is reproduced here, and
# `suppress_lines` must be supplied by that enclosing scope.
#
# Assembles a streaming script that reads the pattern from the first line of
# the file 'ziggy_search', scans stdin, and for each stripped line containing
# the pattern prints either '<text before first colon>:<line number>'
# (suppress_lines truthy) or '<whole line>:<line number>'.  The script is
# saved as search_frame.py and made readable/writable/executable for everyone.
if suppress_lines:
    report_line = "print filename.split(':')[0] + ':' + str(count)\n"
else:
    report_line = "print filename +':' + str(count)\n"

gen = CodeGenerator()
gen.begin()

# Layout of the generated script: a string is emitted with gen.write(),
# +1 calls gen.indent() and -1 calls gen.dedent(), in exactly this order.
script_plan = (
    "#! /usr/bin/env python\n\n",
    "import sys\n\n",
    # read_input(): yield (stripped line, 1-based counter) for every match.
    "def read_input(file, pattern):\n",
    +1,
    "line_count = 0\n",
    "for line in file:\n",
    +1,
    "line_count += 1\n",
    "if pattern in line.strip():\n",
    +1,
    "yield line.strip(), line_count\n",
    -1,
    -1,
    -1,
    "\n\n",
    # main(): load the pattern, then report every matching line.
    "def main():\n",
    +1,
    "search_pattern = open('ziggy_search').readline().strip()\n",
    "data = read_input(sys.stdin, search_pattern)\n",
    "for filename, count in data:\n",
    +1,
    report_line,
    -1,
    -1,
    "\n\n",
    'if __name__ == \"__main__\":\n',
    +1,
    "main()\n",
    "\n",
)
for step in script_plan:
    if step == +1:
        gen.indent()
    elif step == -1:
        gen.dedent()
    else:
        gen.write(step)

# Persist the generated source (followed by a newline) and open up its mode.
script_out = open("search_frame.py", "w")
script_out.write("%s\n" % (gen.end(),))
script_out.close()
os.system("chmod a+rwx search_frame.py")
124
# NOTE(review): the `def` line that encloses these statements is not visible
# in this view; it presumably belongs to make_identity_frame(), which
# sort_ascii() and search() call.  Only the body is reproduced here.
#
# Assembles a pass-through streaming script that prints every non-empty
# stripped stdin line unchanged.  The script is saved as identity_frame.py
# and made readable/writable/executable for everyone.
gen = CodeGenerator()
gen.begin()

# Layout of the generated script: a string is emitted with gen.write(),
# +1 calls gen.indent() and -1 calls gen.dedent(), in exactly this order.
script_plan = (
    "#! /usr/bin/env python\n\n",
    "import sys\n\n",
    # read_input(): yield each stripped line of the given file object.
    "def read_input(file):\n",
    +1,
    "for line in file:\n",
    +1,
    "yield line.strip()\n",
    -1,
    -1,
    "\n\n",
    # main(): echo every non-empty line.
    "def main():\n",
    +1,
    "data = read_input(sys.stdin)\n",
    "for line in data:\n",
    +1,
    "if len(line)>0:\n",
    +1,
    "print line\n",
    -1,
    -1,
    -1,
    "\n\n",
    'if __name__ == \"__main__\":\n',
    +1,
    "main()\n",
    "\n",
)
for step in script_plan:
    if step == +1:
        gen.indent()
    elif step == -1:
        gen.dedent()
    else:
        gen.write(step)

# Persist the generated source (followed by a newline) and open up its mode.
script_out = open("identity_frame.py", "w")
script_out.write("%s\n" % (gen.end(),))
script_out.close()
os.system("chmod a+rwx identity_frame.py")
159
160 -def sort_numeric(input_file, output_file, ascending=True,num_mappers=None, num_reducers=None):
173
def sort_ascii(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    '''Sort a large set of ASCII values with a MapReduce job.

    input_file   -- local path; uploaded to HDFS under its base name (the
                    part after the last "/").
    output_file  -- name handed to the job as its output and then used as the
                    local destination when the result is copied back from HDFS.
    ascending    -- when truthy, the extra comparator argument "r" is passed
                    to hdmc.build_generic_hadoop_call; otherwise it is omitted.
    num_mappers, num_reducers -- forwarded unchanged to
                    hdmc.build_generic_hadoop_call.

    The generated identity_frame.py serves as both mapper and reducer.  The
    assembled Hadoop call is printed before it is executed, and cleanup() is
    run once the output has been copied back.
    '''
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)
    make_identity_frame()

    # Arguments common to both sort directions.
    call_args = [
        "identity_frame.py",
        "identity_frame.py",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
    ]
    # NOTE(review): "r" looks like a reverse-order comparator flag, which
    # would make ascending=True sort in reverse -- confirm against
    # hdmc.build_generic_hadoop_call before relying on the direction.
    if ascending:
        call_args.append("r")
    hadoop_call = hdmc.build_generic_hadoop_call(*call_args)

    print(hadoop_call)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()
187
def search(input_directory, input_files, output_file, search_pattern, suppress_lines=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to search a collection of input files.

    input_directory -- HDFS directory that is created and used as job input.
    input_files     -- iterable of local file paths to search.
    output_file     -- name handed to the job as its output and then used as
                       the local destination when the result is copied back.
    search_pattern  -- text to look for; it is written verbatim as the first
                       line of the local file "ziggy_search", which is shipped
                       with the job as a supporting file.
    suppress_lines  -- forwarded to make_search_frame().
    num_mappers, num_reducers -- forwarded unchanged to
                       hdmc.build_generic_hadoop_call.

    Every line of every input file is prefixed with "<local path>:" before
    upload so that a match can be traced back to its file.  search_frame.py
    is the mapper and identity_frame.py the reducer; cleanup() is run once
    the output has been copied back.
    '''
    hdfs.mkdir(input_directory)

    staging_path = tmp_directory + "/ziggy_search_tmp"
    for f in input_files:
        tmpfile = open(staging_path, "w")
        try:
            original_file = open(f)
            try:
                for line in original_file:
                    # `line` already ends with a newline; drop it before
                    # adding our own so the staged copy has exactly one
                    # record per source line (a blank line after every
                    # record would throw off per-line counting downstream).
                    tmpfile.write(f + ":" + line.rstrip("\n") + "\n")
            finally:
                original_file.close()
        finally:
            tmpfile.close()

        try:
            hdfs.copyToHDFS(staging_path, input_directory + "/" + f.split("/")[-1])
        finally:
            # Never leave the staging file behind, even if the upload fails.
            os.remove(staging_path)

    # Write the pattern directly instead of going through `echo` in a shell:
    # the shell would collapse whitespace, strip quotes and execute any
    # metacharacters contained in the pattern.
    pattern_file = open("ziggy_search", "w")
    try:
        pattern_file.write(search_pattern + "\n")
    finally:
        pattern_file.close()

    make_search_frame(suppress_lines)
    make_identity_frame()

    hadoop_call = hdmc.build_generic_hadoop_call("search_frame.py", "identity_frame.py", input_directory, output_file, ["./ziggy_search"], num_mappers, num_reducers)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()
215
216 -def histogram(input_file, output_file, num_mappers=None, num_reducers=None):
226
# NOTE(review): the `def` line that encloses these statements is not visible
# in this view; it presumably belongs to cleanup(), which sort_ascii() and
# search() call last.  Only the body is reproduced here.
#
# Remove each generated helper file from the current directory if it exists
# as a regular file; absent files are skipped silently.
for leftover in ("identity_frame.py", "histogram_frame.py", "search_frame.py", "ziggy_search"):
    if os.path.isfile(leftover):
        os.remove(leftover)
236