Package ziggy :: Module util
[hide private]
[frames | no frames]

Source Code for Module ziggy.util

  1  import os 
  2  import subprocess 
  3  from . import hdmc 
  4  from hdmc import hdfs 
  5  from . hdmc.code_generator import CodeGenerator 
  6   
  7  tmp_directory = "/tmp"  # Local scratch directory; search() stages its "ziggy_search_tmp" file here.
  8   
  9   
def make_histogram_frame():
    """Generate the histogram mapper script ``histogram_frame.py``.

    The script is assembled with ``CodeGenerator`` and written to the current
    working directory.  The generated program (which uses the Python 2
    ``print`` statement) reads lines from stdin, strips them, and for every
    non-empty line prints ``ValueHistogram:<line>:1``.

    After writing, the file is made readable, writable and executable by
    everyone so it can be run directly as a script.
    """
    c = CodeGenerator()
    c.begin()

    # Script header.
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")

    # read_input(file): yield every input line with whitespace stripped.
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # main(): emit one 'ValueHistogram:<line>:1' record per non-empty line.
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("print 'ValueHistogram:'+line+':'+'1'\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # Entry-point guard of the generated script.
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_path = "histogram_frame.py"
    # The context manager closes the file even if writing fails.
    # c.end() is expected to return the generated source text -- TODO confirm
    # against CodeGenerator; str() mirrors what a print statement would emit.
    with open(frame_path, "w") as frame_file:
        frame_file.write(str(c.end()) + "\n")

    # Equivalent of "chmod a+rwx" without spawning a shell.
    os.chmod(frame_path, 0o777)
43
def make_counting_frame():
    """Generate a pre-aggregating histogram script as ``histogram_frame.py``.

    The script is assembled with ``CodeGenerator`` and written to the current
    working directory.  The generated program (which uses the Python 2
    ``print`` statement) reads lines from stdin, counts how often each
    non-empty stripped line occurs in a ``defaultdict(int)``, and then prints
    one ``ValueHistogram:<line>:<count>`` record per distinct line.

    After writing, the file is made readable, writable and executable by
    everyone.

    NOTE(review): the output file name is ``histogram_frame.py`` -- the same
    name ``make_histogram_frame`` writes -- so whichever runs last wins.
    This looks like a deliberate drop-in replacement, but confirm.
    """
    c = CodeGenerator()
    c.begin()

    # Script header.
    c.write("#! /usr/bin/env python\n\n")
    c.write("from collections import defaultdict\n")
    c.write("import sys\n\n")

    # read_input(file): yield every input line with whitespace stripped.
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # main(): tally the non-empty lines, then emit one record per key.
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("c = defaultdict(int)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("c[line] += 1\n")
    c.dedent()
    c.dedent()
    c.write("for key in c.keys():\n")
    c.indent()
    c.write("print 'ValueHistogram:'+key+':'+str(c[key])\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # Entry-point guard of the generated script.
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_path = "histogram_frame.py"
    # The context manager closes the file even if writing fails.
    # c.end() is expected to return the generated source text -- TODO confirm
    # against CodeGenerator; str() mirrors what a print statement would emit.
    with open(frame_path, "w") as frame_file:
        frame_file.write(str(c.end()) + "\n")

    # Equivalent of "chmod a+rwx" without spawning a shell.
    os.chmod(frame_path, 0o777)
83
def make_search_frame(suppress_lines):
    """Generate the search mapper script ``search_frame.py``.

    The script is assembled with ``CodeGenerator`` and written to the current
    working directory.  The generated program (which uses the Python 2
    ``print`` statement) reads the search pattern from the first line of a
    file named ``ziggy_search`` in its working directory, scans stdin while
    counting lines, and reports every line whose stripped text contains the
    pattern.

    Args:
        suppress_lines: if true, each hit is printed as
            ``<text before the first ':'>:<line count>``; otherwise the whole
            stripped line is printed, followed by ``:<line count>``.

    After writing, the file is made readable, writable and executable by
    everyone.
    """
    c = CodeGenerator()
    c.begin()

    # Script header.
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")

    # read_input(file, pattern): yield (stripped line, 1-based line count)
    # for every line containing the pattern.
    c.write("def read_input(file, pattern):\n")
    c.indent()
    c.write("line_count = 0\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line_count += 1\n")
    c.write("if pattern in line.strip():\n")
    c.indent()
    c.write("yield line.strip(), line_count\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # main(): load the pattern and print one record per hit.
    c.write("def main():\n")
    c.indent()
    c.write("search_pattern = open('ziggy_search').readline().strip()\n")
    c.write("data = read_input(sys.stdin, search_pattern)\n")
    c.write("for filename, count in data:\n")
    c.indent()
    if suppress_lines:
        # Keep only the prefix before the first ':' of the matched line.
        c.write("print filename.split(':')[0] + ':' + str(count)\n")
    else:
        c.write("print filename +':' + str(count)\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # Entry-point guard of the generated script.
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_path = "search_frame.py"
    # The context manager closes the file even if writing fails.
    # c.end() is expected to return the generated source text -- TODO confirm
    # against CodeGenerator; str() mirrors what a print statement would emit.
    with open(frame_path, "w") as frame_file:
        frame_file.write(str(c.end()) + "\n")

    # Equivalent of "chmod a+rwx" without spawning a shell.
    os.chmod(frame_path, 0o777)
123
def make_identity_frame():
    """Generate the pass-through script ``identity_frame.py``.

    The script is assembled with ``CodeGenerator`` and written to the current
    working directory.  The generated program (which uses the Python 2
    ``print`` statement) reads lines from stdin, strips them, and prints
    every non-empty line unchanged.

    After writing, the file is made readable, writable and executable by
    everyone.
    """
    c = CodeGenerator()
    c.begin()

    # Script header.
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")

    # read_input(file): yield every input line with whitespace stripped.
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # main(): echo each non-empty line.
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("print line\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")

    # Entry-point guard of the generated script.
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_path = "identity_frame.py"
    # The context manager closes the file even if writing fails.
    # c.end() is expected to return the generated source text -- TODO confirm
    # against CodeGenerator; str() mirrors what a print statement would emit.
    with open(frame_path, "w") as frame_file:
        frame_file.write(str(c.end()) + "\n")

    # Equivalent of "chmod a+rwx" without spawning a shell.
    os.chmod(frame_path, 0o777)
158
def sort_numeric(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    """Run a Hadoop job over *input_file* using the numeric key comparator.

    The input is copied to HDFS under its base name, ``identity_frame.py`` is
    generated and passed as both script arguments of the job, the job is
    executed, and the result is copied from HDFS to *output_file*.

    Args:
        input_file: local path of the file to sort ('/'-separated).
        output_file: name used both for the HDFS output and the local copy.
        ascending: selects the key comparator flag (see the note below).
        num_mappers: forwarded to ``hdmc.build_generic_hadoop_call``.
        num_reducers: forwarded to ``hdmc.build_generic_hadoop_call``.
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)
    make_identity_frame()

    # NOTE(review): ascending=True selects "nr" and ascending=False selects
    # "n".  If "r" means "reverse" this mapping looks inverted, but
    # sort_ascii follows the same convention -- confirm against hdmc before
    # changing it.
    keycomp = "nr" if ascending else "n"

    hadoop_call = hdmc.build_generic_hadoop_call(
        "identity_frame.py",
        "identity_frame.py",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
        keycomp,
    )
    # Echo the assembled command before running it.
    print(hadoop_call)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
170
def sort_ascii(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    """Run a Hadoop job over *input_file* using the non-numeric key ordering.

    The input is copied to HDFS under its base name, ``identity_frame.py`` is
    generated and passed as both script arguments of the job, the job is
    executed, and the result is copied from HDFS to *output_file*.

    Args:
        input_file: local path of the file to sort ('/'-separated).
        output_file: name used both for the HDFS output and the local copy.
        ascending: when true the key comparator flag "r" is passed; when
            false no comparator argument is passed at all.
        num_mappers: forwarded to ``hdmc.build_generic_hadoop_call``.
        num_reducers: forwarded to ``hdmc.build_generic_hadoop_call``.
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)
    make_identity_frame()

    # NOTE(review): "r" is supplied for ascending=True.  If "r" means
    # "reverse" this looks inverted, but sort_numeric follows the same
    # convention -- confirm against hdmc before changing it.
    comparator_args = ("r",) if ascending else ()

    hadoop_call = hdmc.build_generic_hadoop_call(
        "identity_frame.py",
        "identity_frame.py",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
        *comparator_args
    )
    # Echo the assembled command before running it.
    print(hadoop_call)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
182
def search(input_directory, input_files, output_file, search_pattern,
           suppress_lines=True, num_mappers=None, num_reducers=None):
    """Search a set of local files for a substring via a Hadoop job.

    Every input file is re-written with each line prefixed by
    ``<file path>:`` and uploaded into the HDFS directory *input_directory*.
    The pattern is stored in a local file named ``ziggy_search``, the search
    and identity scripts are generated, the job is executed, and the result
    is copied from HDFS to *output_file*.

    Args:
        input_directory: HDFS directory to create and fill with the inputs.
        input_files: iterable of local file paths ('/'-separated).
        output_file: name used both for the HDFS output and the local copy.
        search_pattern: substring to look for.  Only its first line is used,
            because the generated script reads a single line of the file.
        suppress_lines: forwarded to ``make_search_frame``; controls whether
            the matched text is included in each reported hit.
        num_mappers: forwarded to ``hdmc.build_generic_hadoop_call``.
        num_reducers: forwarded to ``hdmc.build_generic_hadoop_call``.
    """
    # Make an HDFS directory for the input files.
    hdfs.mkdir(input_directory)

    # Put the files in that HDFS directory, tagging each line with its
    # originating file name.  The same staging file is reused for every input.
    staging_path = tmp_directory + "/ziggy_search_tmp"
    for f in input_files:
        with open(f) as original_file:
            with open(staging_path, "w") as tmpfile:
                for line in original_file:
                    # Lines read from the file already end in a newline;
                    # write exactly one per record.  A second newline would
                    # put a blank line after every record and skew the line
                    # count reported by the search script.
                    tmpfile.write(f + ":" + line.rstrip("\n") + "\n")
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(staging_path, hdfs_location)

    # Store the search pattern for the generated script.  It is written
    # directly rather than through a shell "echo" so that quotes, spaces and
    # shell metacharacters in the pattern are preserved and cannot be
    # interpreted as commands.
    with open("ziggy_search", "w") as pattern_file:
        pattern_file.write(search_pattern + "\n")

    # Generate the two job scripts.
    make_search_frame(suppress_lines)
    make_identity_frame()

    # Build and run the Hadoop call; "./ziggy_search" is handed over in the
    # job's file list (presumably so it is shipped with the job -- confirm
    # against hdmc.build_generic_hadoop_call).
    hadoop_call = hdmc.build_generic_hadoop_call(
        "search_frame.py",
        "identity_frame.py",
        input_directory,
        output_file,
        ["./ziggy_search"],
        num_mappers,
        num_reducers,
    )
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
207
def histogram(input_file, output_file, num_mappers=None, num_reducers=None):
    """Build a value histogram of the lines of *input_file* via a Hadoop job.

    The input is copied to HDFS under its base name, the histogram mapper
    script is generated, the job is run with ``histogram_frame.py`` and the
    literal ``"aggregate"`` as its two script arguments, and the result is
    copied from HDFS to *output_file*.

    Args:
        input_file: local path of the file to analyse ('/'-separated).
        output_file: name used both for the HDFS output and the local copy.
        num_mappers: forwarded to ``hdmc.build_generic_hadoop_call``.
        num_reducers: forwarded to ``hdmc.build_generic_hadoop_call``.
    """
    hdfs_name = input_file.split("/")[-1]
    hdfs.copyToHDFS(input_file, hdfs_name)

    # Make the histogram mapper.
    make_histogram_frame()

    # No key comparator is passed to the job; the builder's default applies.
    hadoop_call = hdmc.build_generic_hadoop_call(
        "histogram_frame.py",
        "aggregate",
        hdfs_name,
        output_file,
        [],
        num_mappers,
        num_reducers,
    )
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
216