Package ziggy :: Module util
[hide private]
[frames | no frames]

Source Code for Module ziggy.util

  1  import os 
  2  import subprocess 
  3  from . import hdmc 
  4  from hdmc import hdfs 
  5  from . hdmc.code_generator import CodeGenerator 
  6   
# Local directory in which search() stages its temporary "ziggy_search_tmp"
# file before copying it to HDFS.
tmp_directory = "/tmp"
# Names exported by "from ziggy.util import *".
__all__ = ["sort_numeric", "sort_ascii", "search", "histogram"]
  9   
 10   
def make_histogram_frame():
    """Write the script ``histogram_frame.py`` into the current directory.

    The script text is assembled with a CodeGenerator. As written here, the
    generated program reads stdin line by line, strips each line and, for
    every non-empty one, prints ``ValueHistogram:<line>:1``. After writing,
    the file is made readable, writable and executable for everyone via
    ``chmod a+rwx``.
    """
    # Sentinels marking where the generator's indentation level changes.
    INDENT, DEDENT = object(), object()
    steps = (
        "#! /usr/bin/env python\n\n",
        "import sys\n\n",
        "def read_input(file):\n",
        INDENT,
        "for line in file:\n",
        INDENT,
        "yield line.strip()\n",
        DEDENT,
        DEDENT,
        "\n\n",
        "def main():\n",
        INDENT,
        "data = read_input(sys.stdin)\n",
        "for line in data:\n",
        INDENT,
        "if len(line)>0:\n",
        INDENT,
        "print 'ValueHistogram:'+line+':'+'1'\n",
        DEDENT,
        DEDENT,
        DEDENT,
        "\n\n",
        'if __name__ == \"__main__\":\n',
        INDENT,
        "main()\n",
        "\n",
    )

    generator = CodeGenerator()
    generator.begin()
    # Replay the recorded steps against the generator in order.
    for step in steps:
        if step is INDENT:
            generator.indent()
        elif step is DEDENT:
            generator.dedent()
        else:
            generator.write(step)

    script = open("histogram_frame.py", "w")
    # Generated text followed by a single trailing newline.
    script.write(str(generator.end()) + "\n")
    script.close()
    os.system("chmod a+rwx histogram_frame.py")
44
def make_counting_frame():
    """Write a counting variant of the script ``histogram_frame.py``.

    The script text is assembled with a CodeGenerator. As written here, the
    generated program tallies every non-empty stripped stdin line in a
    ``defaultdict(int)`` and then prints one ``ValueHistogram:<key>:<count>``
    line per distinct key. The output file name is ``histogram_frame.py``,
    the same name make_histogram_frame() writes to; the file is then made
    readable, writable and executable for everyone via ``chmod a+rwx``.
    """
    # Sentinels marking where the generator's indentation level changes.
    INDENT, DEDENT = object(), object()
    steps = (
        "#! /usr/bin/env python\n\n",
        "from collections import defaultdict\n",
        "import sys\n\n",
        "def read_input(file):\n",
        INDENT,
        "for line in file:\n",
        INDENT,
        "yield line.strip()\n",
        DEDENT,
        DEDENT,
        "\n\n",
        "def main():\n",
        INDENT,
        "data = read_input(sys.stdin)\n",
        "c = defaultdict(int)\n",
        "for line in data:\n",
        INDENT,
        "if len(line)>0:\n",
        INDENT,
        "c[line] += 1\n",
        DEDENT,
        DEDENT,
        "for key in c.keys():\n",
        INDENT,
        "print 'ValueHistogram:'+key+':'+str(c[key])\n",
        DEDENT,
        DEDENT,
        "\n\n",
        'if __name__ == \"__main__\":\n',
        INDENT,
        "main()\n",
        "\n",
    )

    generator = CodeGenerator()
    generator.begin()
    # Replay the recorded steps against the generator in order.
    for step in steps:
        if step is INDENT:
            generator.indent()
        elif step is DEDENT:
            generator.dedent()
        else:
            generator.write(step)

    script = open("histogram_frame.py", "w")
    # Generated text followed by a single trailing newline.
    script.write(str(generator.end()) + "\n")
    script.close()
    os.system("chmod a+rwx histogram_frame.py")
84
def make_search_frame(suppress_lines):
    """Write the script ``search_frame.py`` into the current directory.

    The script text is assembled with a CodeGenerator. As written here, the
    generated program reads a pattern from the first line of the file
    ``ziggy_search``, scans stdin while counting lines, and reports every
    stripped line that contains the pattern together with its line count.

    suppress_lines -- when true, the generated program prints only the text
        before the first ':' of a matching line followed by ':' and the
        count; otherwise it prints the whole matching line followed by ':'
        and the count.

    The file is finally made readable, writable and executable for everyone
    via ``chmod a+rwx``.
    """
    # Decide up front which report statement goes into the generated script.
    if suppress_lines:
        report_statement = "print filename.split(':')[0] + ':' + str(count)\n"
    else:
        report_statement = "print filename +':' + str(count)\n"

    # Sentinels marking where the generator's indentation level changes.
    INDENT, DEDENT = object(), object()
    steps = (
        "#! /usr/bin/env python\n\n",
        "import sys\n\n",
        "def read_input(file, pattern):\n",
        INDENT,
        "line_count = 0\n",
        "for line in file:\n",
        INDENT,
        "line_count += 1\n",
        "if pattern in line.strip():\n",
        INDENT,
        "yield line.strip(), line_count\n",
        DEDENT,
        DEDENT,
        DEDENT,
        "\n\n",
        "def main():\n",
        INDENT,
        "search_pattern = open('ziggy_search').readline().strip()\n",
        "data = read_input(sys.stdin, search_pattern)\n",
        "for filename, count in data:\n",
        INDENT,
        report_statement,
        DEDENT,
        DEDENT,
        "\n\n",
        'if __name__ == \"__main__\":\n',
        INDENT,
        "main()\n",
        "\n",
    )

    generator = CodeGenerator()
    generator.begin()
    # Replay the recorded steps against the generator in order.
    for step in steps:
        if step is INDENT:
            generator.indent()
        elif step is DEDENT:
            generator.dedent()
        else:
            generator.write(step)

    script = open("search_frame.py", "w")
    # Generated text followed by a single trailing newline.
    script.write(str(generator.end()) + "\n")
    script.close()
    os.system("chmod a+rwx search_frame.py")
124
def make_identity_frame():
    """Write the script ``identity_frame.py`` into the current directory.

    The script text is assembled with a CodeGenerator. As written here, the
    generated program reads stdin line by line, strips each line and prints
    every non-empty one unchanged. After writing, the file is made readable,
    writable and executable for everyone via ``chmod a+rwx``.
    """
    # Sentinels marking where the generator's indentation level changes.
    INDENT, DEDENT = object(), object()
    steps = (
        "#! /usr/bin/env python\n\n",
        "import sys\n\n",
        "def read_input(file):\n",
        INDENT,
        "for line in file:\n",
        INDENT,
        "yield line.strip()\n",
        DEDENT,
        DEDENT,
        "\n\n",
        "def main():\n",
        INDENT,
        "data = read_input(sys.stdin)\n",
        "for line in data:\n",
        INDENT,
        "if len(line)>0:\n",
        INDENT,
        "print line\n",
        DEDENT,
        DEDENT,
        DEDENT,
        "\n\n",
        'if __name__ == \"__main__\":\n',
        INDENT,
        "main()\n",
        "\n",
    )

    generator = CodeGenerator()
    generator.begin()
    # Replay the recorded steps against the generator in order.
    for step in steps:
        if step is INDENT:
            generator.indent()
        elif step is DEDENT:
            generator.dedent()
        else:
            generator.write(step)

    script = open("identity_frame.py", "w")
    # Generated text followed by a single trailing newline.
    script.write(str(generator.end()) + "\n")
    script.close()
    os.system("chmod a+rwx identity_frame.py")
159
def sort_numeric(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    """Use MapReduce to sort a large set of numeric values.

    input_file   -- local path of the data; it is copied to HDFS under the
                    part of the path after the last '/'.
    output_file  -- job output name; after the job it is copied back from
                    HDFS to the same local name.
    ascending    -- selects the key comparator option string (see note
                    below).
    num_mappers, num_reducers -- forwarded unchanged to
                    hdmc.build_generic_hadoop_call.

    The generated identity_frame.py is passed as the first two arguments of
    the Hadoop call (presumably mapper and reducer). The call string is
    printed before it is executed, and the generated helper files are removed
    afterwards through cleanup().
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)
    make_identity_frame()

    # NOTE(review): ascending=True selects "nr" and ascending=False selects
    # "n". If these are Hadoop comparator flags (n = numeric, r = reverse)
    # the mapping looks inverted -- confirm against hdmc before changing.
    keycomp = "nr" if ascending else "n"

    hadoop_call = hdmc.build_generic_hadoop_call(
        "identity_frame.py",
        "identity_frame.py",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
        keycomp,
    )
    print(hadoop_call)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()
173
def sort_ascii(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    """Use MapReduce to sort a large set of ASCII values.

    input_file   -- local path of the data; it is copied to HDFS under the
                    part of the path after the last '/'.
    output_file  -- job output name; after the job it is copied back from
                    HDFS to the same local name.
    ascending    -- when true the comparator option "r" is passed to the
                    Hadoop call; when false no comparator argument is passed
                    at all (see note below).
    num_mappers, num_reducers -- forwarded unchanged to
                    hdmc.build_generic_hadoop_call.

    The generated identity_frame.py is passed as the first two arguments of
    the Hadoop call (presumably mapper and reducer). The call string is
    printed before it is executed, and the generated helper files are removed
    afterwards through cleanup().
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)
    make_identity_frame()

    call_args = [
        "identity_frame.py",
        "identity_frame.py",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
    ]
    if ascending:
        # NOTE(review): "r" is sent for ascending=True. If it is Hadoop's
        # "reverse" comparator flag the mapping looks inverted -- confirm
        # against hdmc before changing.
        call_args.append("r")

    hadoop_call = hdmc.build_generic_hadoop_call(*call_args)
    print(hadoop_call)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()
187
def search(input_directory, input_files, output_file, search_pattern, suppress_lines=True, num_mappers=None, num_reducers=None):
    """Use MapReduce to search a collection of input files.

    input_directory -- HDFS directory that is created to hold the inputs.
    input_files     -- iterable of local file paths to search. Each file is
                       re-written with "<path>:" prefixed to every line and
                       uploaded to input_directory under the part of its
                       path after the last '/'.
    output_file     -- job output name; after the job it is copied back
                       from HDFS to the same local name.
    search_pattern  -- literal text to look for. It is written verbatim as
                       the single line of the local file "ziggy_search",
                       which is shipped with the job.
    suppress_lines  -- forwarded to make_search_frame(); controls whether
                       the generated mapper prints only the file name or
                       the whole matching line in front of the line count.
    num_mappers, num_reducers -- forwarded unchanged to
                       hdmc.build_generic_hadoop_call.

    The generated helper files are removed afterwards through cleanup().
    """
    # Make an HDFS directory for the input files.
    hdfs.mkdir(input_directory)

    # Put the files in that HDFS directory, each line tagged with its
    # originating file name.
    staging_path = tmp_directory + "/ziggy_search_tmp"
    for f in input_files:
        with open(staging_path, "w") as tmpfile:
            with open(f) as original_file:
                for line in original_file:
                    # Lines read from a file keep their newline; drop it and
                    # emit exactly one, so no blank line follows each record
                    # and skews the line counter in the generated mapper.
                    tmpfile.write(f + ":" + line.rstrip("\n") + "\n")
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(staging_path, hdfs_location)
        os.remove(staging_path)

    # Make a special input for the search pattern. Written directly rather
    # than through a shell "echo", so whitespace, quotes and shell
    # metacharacters in the pattern are preserved and never executed.
    with open("ziggy_search", "w") as pattern_file:
        pattern_file.write(search_pattern + "\n")

    # Generate the search and identity scripts used by the job.
    make_search_frame(suppress_lines)
    make_identity_frame()

    # Build and run the Hadoop call, then fetch the result.
    hadoop_call = hdmc.build_generic_hadoop_call(
        "search_frame.py",
        "identity_frame.py",
        input_directory,
        output_file,
        ["./ziggy_search"],
        num_mappers,
        num_reducers,
    )
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()
215
def histogram(input_file, output_file, num_mappers=None, num_reducers=None):
    """Use MapReduce Aggregation to create a Histogram Report from large input file.

    input_file  -- local path of the data; it is copied to HDFS under the
                   part of the path after the last '/'.
    output_file -- job output name; after the job it is copied back from
                   HDFS to the same local name.
    num_mappers, num_reducers -- forwarded unchanged to
                   hdmc.build_generic_hadoop_call.

    The generated histogram_frame.py is passed as the first argument of the
    Hadoop call and the literal "aggregate" as the second (presumably mapper
    and reducer). No key comparator is passed. The generated helper files
    are removed afterwards through cleanup().
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)

    # Make the histogram mapper script.
    make_histogram_frame()

    hadoop_call = hdmc.build_generic_hadoop_call(
        "histogram_frame.py",
        "aggregate",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
    )
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()
226
def cleanup():
    """Delete the helper files this module generates in the current directory.

    Removes identity_frame.py, histogram_frame.py, search_frame.py and
    ziggy_search, in that order, skipping any that is not an existing
    regular file.
    """
    leftovers = (
        "identity_frame.py",
        "histogram_frame.py",
        "search_frame.py",
        "ziggy_search",
    )
    for leftover in leftovers:
        if os.path.isfile(leftover):
            os.remove(leftover)
236