1 import os
2 import subprocess
3 from . import hdmc
4 from hdmc import hdfs
5 from . hdmc.code_generator import CodeGenerator
6
7 tmp_directory = "/tmp"
8
9
# Generate "histogram_frame.py": a streaming script that reads stdin and,
# for every non-empty stripped line, prints 'ValueHistogram:<line>:1'.
#
# NOTE(review): no enclosing ``def`` header is visible for these statements;
# as written they execute when the module is imported. Confirm whether a
# function header was lost from this file.
c = CodeGenerator()
c.begin()
c.write("#! /usr/bin/env python\n\n")
c.write("import sys\n\n")

# Generated helper: read_input(file) yields each input line, stripped.
c.write("def read_input(file):\n")
c.indent()
c.write("for line in file:\n")
c.indent()
c.write("yield line.strip()\n")
c.dedent()
c.dedent()
c.write("\n\n")

# Generated main(): emit one 'ValueHistogram:<line>:1' record per
# non-empty line.
c.write("def main():\n")
c.indent()
c.write("data = read_input(sys.stdin)\n")
c.write("for line in data:\n")
c.indent()
c.write("if len(line)>0:\n")
c.indent()
c.write("print 'ValueHistogram:'+line+':'+'1'\n")
c.dedent()
c.dedent()
c.dedent()
c.write("\n\n")

# Generated script entry point.
c.write('if __name__ == \"__main__\":\n')
c.indent()
c.write("main()\n")
c.write("\n")

# Write the generated source followed by a newline. The ``with`` block
# guarantees the handle is closed even if c.end() raises.
with open("histogram_frame.py", "w") as frame_file:
    frame_file.write(str(c.end()) + "\n")

# Equivalent of ``chmod a+rwx`` without spawning a shell.
os.chmod("histogram_frame.py", os.stat("histogram_frame.py").st_mode | 0o777)
43
# Generate "histogram_frame.py": a streaming script that tallies every
# non-empty stripped stdin line in a defaultdict(int) and then prints one
# 'ValueHistogram:<line>:<count>' record per distinct line.
#
# NOTE(review): no enclosing ``def`` header is visible for these statements;
# as written they execute when the module is imported. Confirm whether a
# function header was lost from this file.
# NOTE(review): this writes the same file name, "histogram_frame.py", that an
# earlier generator in this file also writes, so whichever runs last wins.
# Confirm the overwrite is intended.
c = CodeGenerator()
c.begin()
c.write("#! /usr/bin/env python\n\n")
c.write("from collections import defaultdict\n")
c.write("import sys\n\n")

# Generated helper: read_input(file) yields each input line, stripped.
c.write("def read_input(file):\n")
c.indent()
c.write("for line in file:\n")
c.indent()
c.write("yield line.strip()\n")
c.dedent()
c.dedent()
c.write("\n\n")

# Generated main(): count occurrences of each non-empty line ...
c.write("def main():\n")
c.indent()
c.write("data = read_input(sys.stdin)\n")
c.write("c = defaultdict(int)\n")
c.write("for line in data:\n")
c.indent()
c.write("if len(line)>0:\n")
c.indent()
c.write("c[line] += 1\n")
c.dedent()
c.dedent()
# ... then print one record per distinct line with its total.
c.write("for key in c.keys():\n")
c.indent()
c.write("print 'ValueHistogram:'+key+':'+str(c[key])\n")
c.dedent()
c.dedent()
c.write("\n\n")

# Generated script entry point.
c.write('if __name__ == \"__main__\":\n')
c.indent()
c.write("main()\n")
c.write("\n")

# Write the generated source followed by a newline. The ``with`` block
# guarantees the handle is closed even if c.end() raises.
with open("histogram_frame.py", "w") as frame_file:
    frame_file.write(str(c.end()) + "\n")

# Equivalent of ``chmod a+rwx`` without spawning a shell.
os.chmod("histogram_frame.py", os.stat("histogram_frame.py").st_mode | 0o777)
83
def make_search_frame(suppress_lines):
    """Write the executable streaming script ``search_frame.py``.

    The generated script reads a search pattern from the first line of a
    file named ``ziggy_search``, scans stdin while keeping a running line
    counter, and for every stripped line that contains the pattern prints
    the line together with the counter value.

    Args:
        suppress_lines: When true, the generated script prints only the
            text before the first ':' of each matching line, followed by
            ':' and the line count. Otherwise it prints the whole matching
            line, followed by ':' and the line count.
    """
    # NOTE(review): the ``def`` header was not present in the source; it is
    # reconstructed from the call ``make_search_frame(suppress_lines)`` and
    # from the body's use of ``suppress_lines``.
    gen = CodeGenerator()
    gen.begin()
    gen.write("#! /usr/bin/env python\n\n")
    gen.write("import sys\n\n")

    # Generated helper: read_input(file, pattern) yields
    # (stripped_line, line_number) for lines containing the pattern.
    gen.write("def read_input(file, pattern):\n")
    gen.indent()
    gen.write("line_count = 0\n")
    gen.write("for line in file:\n")
    gen.indent()
    gen.write("line_count += 1\n")
    gen.write("if pattern in line.strip():\n")
    gen.indent()
    gen.write("yield line.strip(), line_count\n")
    gen.dedent()
    gen.dedent()
    gen.dedent()
    gen.write("\n\n")

    # Generated main(): load the pattern, then report every match.
    gen.write("def main():\n")
    gen.indent()
    gen.write("search_pattern = open('ziggy_search').readline().strip()\n")
    gen.write("data = read_input(sys.stdin, search_pattern)\n")
    gen.write("for filename, count in data:\n")
    gen.indent()
    # The output format is chosen at generation time, not at run time.
    if suppress_lines:
        gen.write("print filename.split(':')[0] + ':' + str(count)\n")
    else:
        gen.write("print filename +':' + str(count)\n")
    gen.dedent()
    gen.dedent()
    gen.write("\n\n")

    # Generated script entry point.
    gen.write('if __name__ == \"__main__\":\n')
    gen.indent()
    gen.write("main()\n")
    gen.write("\n")

    # Write the generated source followed by a newline. The ``with`` block
    # guarantees the handle is closed even if gen.end() raises.
    with open("search_frame.py", "w") as frame_file:
        frame_file.write(str(gen.end()) + "\n")

    # Equivalent of ``chmod a+rwx`` without spawning a shell.
    os.chmod("search_frame.py", os.stat("search_frame.py").st_mode | 0o777)
123
def make_identity_frame():
    """Write the executable streaming script ``identity_frame.py``.

    The generated script reads stdin, strips each line, and prints every
    line that is non-empty after stripping.
    """
    # NOTE(review): the ``def`` header was not present in the source; it is
    # reconstructed from the zero-argument calls ``make_identity_frame()``
    # elsewhere in this file.
    gen = CodeGenerator()
    gen.begin()
    gen.write("#! /usr/bin/env python\n\n")
    gen.write("import sys\n\n")

    # Generated helper: read_input(file) yields each input line, stripped.
    gen.write("def read_input(file):\n")
    gen.indent()
    gen.write("for line in file:\n")
    gen.indent()
    gen.write("yield line.strip()\n")
    gen.dedent()
    gen.dedent()
    gen.write("\n\n")

    # Generated main(): echo every non-empty line.
    gen.write("def main():\n")
    gen.indent()
    gen.write("data = read_input(sys.stdin)\n")
    gen.write("for line in data:\n")
    gen.indent()
    gen.write("if len(line)>0:\n")
    gen.indent()
    gen.write("print line\n")
    gen.dedent()
    gen.dedent()
    gen.dedent()
    gen.write("\n\n")

    # Generated script entry point.
    gen.write('if __name__ == \"__main__\":\n')
    gen.indent()
    gen.write("main()\n")
    gen.write("\n")

    # Write the generated source followed by a newline. The ``with`` block
    # guarantees the handle is closed even if gen.end() raises.
    with open("identity_frame.py", "w") as frame_file:
        frame_file.write(str(gen.end()) + "\n")

    # Equivalent of ``chmod a+rwx`` without spawning a shell.
    os.chmod("identity_frame.py", os.stat("identity_frame.py").st_mode | 0o777)
158
159 -def sort_numeric(input_file, output_file, ascending=True,num_mappers=None, num_reducers=None):
170
def sort_ascii(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    """Run a Hadoop job over ``input_file`` using the identity frame.

    The input is copied to HDFS under its base name, ``identity_frame.py``
    is regenerated and used as both scripts of the job, the job is executed,
    and ``output_file`` is copied back from HDFS.

    Args:
        input_file: Local path of the file to sort; only the part after the
            last '/' is used as its HDFS name.
        output_file: Name used both as the job output and as the local
            destination when copying back from HDFS.
        ascending: When true, the extra key-comparator argument "r" is
            passed to the call builder; when false, no comparator is passed.
        num_mappers: Forwarded unchanged to the call builder.
        num_reducers: Forwarded unchanged to the call builder.
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)
    make_identity_frame()

    call_args = [
        "identity_frame.py",
        "identity_frame.py",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
    ]
    # NOTE(review): "r" is passed when ascending is requested, which looks
    # inverted if "r" means reverse - confirm against
    # hdmc.build_generic_hadoop_call.
    if ascending:
        call_args.append("r")
    hadoop_call = hdmc.build_generic_hadoop_call(*call_args)

    print(hadoop_call)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
182
def search(input_directory, input_files, output_file, search_pattern, suppress_lines=True, num_mappers=None, num_reducers=None):
    """Search a set of local files for ``search_pattern`` with a Hadoop job.

    Each input file is staged to HDFS with every line prefixed by
    ``<path>:`` so the mapper can report which file a match came from. The
    pattern is written to a local file named ``ziggy_search``, which is
    passed to the call builder as ``./ziggy_search`` and read by the
    generated search frame. The job output is copied back from HDFS.

    Args:
        input_directory: HDFS directory that is created and used as the
            job input.
        input_files: Iterable of local file paths to search.
        output_file: Name used both as the job output and as the local
            destination when copying back from HDFS.
        search_pattern: Substring to look for in each line.
        suppress_lines: Forwarded to ``make_search_frame``; selects whether
            the generated mapper prints only the file prefix or the whole
            matching line.
        num_mappers: Forwarded unchanged to the call builder.
        num_reducers: Forwarded unchanged to the call builder.
    """
    hdfs.mkdir(input_directory)

    staging_path = tmp_directory + "/ziggy_search_tmp"
    for f in input_files:
        # Nested ``with`` blocks close both handles even on error.
        with open(f) as original_file:
            with open(staging_path, "w") as tmpfile:
                for line in original_file:
                    # Each line already ends with a newline; normalise to
                    # exactly one so the staged copy has no blank line after
                    # every record, which would skew the mapper's line counter.
                    tmpfile.write(f + ":" + line.rstrip("\n") + "\n")
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(staging_path, hdfs_location)

    # Write the pattern directly instead of going through ``echo`` in a
    # shell, so whitespace and shell metacharacters in the pattern are kept
    # verbatim and cannot be interpreted as shell syntax.
    with open("ziggy_search", "w") as pattern_file:
        pattern_file.write(search_pattern + "\n")

    make_search_frame(suppress_lines)
    make_identity_frame()

    hadoop_call = hdmc.build_generic_hadoop_call(
        "search_frame.py",
        "identity_frame.py",
        input_directory,
        output_file,
        ["./ziggy_search"],
        num_mappers,
        num_reducers,
    )
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
207
208 -def histogram(input_file, output_file, num_mappers=None, num_reducers=None):
216