
Source Code for Module checkm.checkm

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""Checksumming convenience classes

Helpers for creating and verifying checkm and bagit manifest files.
A checkm manifest line carries up to six whitespace-separated tokens:

                [@]SourceFileOrURL  Alg     Digest  Length   ModTime   TargetFileOrURL
TOKEN NUMBER:    1                  2       3       4        5         6

"""

from __future__ import with_statement

COLUMNS = { 0:"SourceFileOrURL",
            1:"Alg",
            2:"Digest",
            3:"Length",
            4:"ModTime",
            5:"TargetFileOrURL",
            }
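
# A minimal sketch (not part of the original module) of how a parsed manifest
# row lines up with the COLUMNS map above; the row values here are invented.
def _example_columns():
    row = [u"data/file.txt", u"md5", u"d41d8cd98f00b204e9800998ecf8427e", u"0"]
    # -> {'SourceFileOrURL': u'data/file.txt', 'Alg': u'md5', ...}
    return dict((COLUMNS[i], value) for i, value in enumerate(row))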


import os, sys
from stat import *

import re

from collections import defaultdict

import hashlib

import codecs

import logging

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger('checkm')

class NotFound(Exception):
    """The item or directory was either not found, or not accessible."""

    def __init__(self, *arg, **kw):
        """Keep the positional and keyword arguments as error context."""
        self.context = (arg, kw)

    def __repr__(self):
        return self.context.__str__()

    def __str__(self):
        return self.context.__str__()

class CheckmReporter(object):
    COLUMN_NAMES = [u'# [@]SourceFileOrURL', u'Alg', u'Digest', u'Length', u'ModTime']

    def __init__(self):
        """Create a reporter with its own CheckmScanner instance."""
        self.scanner = CheckmScanner()

    def _get_max_len(self, report):
        """Return a dict mapping column index -> widest cell in that column."""
        cols = defaultdict(lambda: 0)
        for line in report:
            for index in xrange(len(line)):
                if len(line[index]) > cols[index]:
                    cols[index] = len(line[index])
        return cols

    def _space_line(self, line, col_maxes):
        """Pad each cell to its column's width plus four spaces and join them."""
        spaced_line = []
        for index in xrange(len(line)):
            spaced_line.append(line[index])
            spaces = col_maxes[index] - len(line[index]) + 4
            spaced_line.append(u" " * spaces)
        return u"".join(spaced_line)

    def create_bagit_manifest(self, scan_directory, algorithm, recursive=False, delimiter=" ", filename=None):
        """Scan scan_directory and write a bagit manifest of digest/path pairs.

        filename may be a path or a file-like object with a write method;
        if omitted it defaults to manifest-<algorithm>.txt. Directory
        entries (digest 'd') are skipped.
        """
        if not filename:
            filename = "manifest-%s.txt" % algorithm
        logger.info("Creating bagit manifest file(%s) for dir(%s) with Alg:%s" % (filename,
                                                                                  scan_directory,
                                                                                  algorithm))
        report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=3)
        if hasattr(filename, 'write'):
            for line in report:
                if line[2] != "d":
                    filename.write("%s%s%s\n" % (line[2], delimiter, line[0]))
            filename.write("\n")
        else:
            with codecs.open(filename, encoding='utf-8', mode="w") as output:
                for line in report:
                    if line[2] != "d":
                        output.write("%s%s%s\n" % (line[2], delimiter, line[0]))
                output.write("\n")
        return filename

    def create_multilevel_checkm(self, top_directory, algorithm, checkm_filename, columns=3):
        """Write a checkm file into every directory under top_directory.

        Each directory's checkm file lists its own entries, followed by
        @-prefixed include lines pointing at the checkm files of its
        immediate subdirectories.
        """
        logger.info("Creating multilevel checkm files '(%s)' from top level directory(%s) with Alg:%s and columns:%s" % (checkm_filename, top_directory, algorithm, columns))
        if not os.path.isdir(top_directory):
            raise NotFound(top_directory=top_directory)
        # Gather the directories to scan, and their subdirectories, bottom up
        dirs = dict([(root, dirnames) for (root, dirnames, _) in os.walk(top_directory, topdown=False)])
        for dirname in dirs:
            with codecs.open(os.path.join(dirname, checkm_filename), encoding='utf-8', mode="w") as output:
                self.create_checkm_file(dirname,
                                        algorithm,
                                        os.path.join(dirname, checkm_filename),
                                        recursive=False,
                                        columns=columns,
                                        checkm_file=output)
                subdir_report = []
                for subdir in dirs[dirname]:
                    try:
                        # os.walk yields subdirectory names relative to dirname
                        line = self.scanner.scan_path(os.path.join(dirname, subdir, checkm_filename), algorithm, columns)
                        line[0] = '@%s' % (line[0])
                        subdir_report.append(line)
                    except Exception, e:
                        print "Fail! %s" % e
                col_maxes = self._get_max_len(subdir_report)
                for line in subdir_report:
                    output.write('%s\n' % (self._space_line(line, col_maxes)))
                output.write('\n')

    def create_checkm_file(self, scan_directory, algorithm, checkm_filename, recursive=False, columns=3, checkm_file=None):
        """Scan scan_directory and write a checkm manifest.

        If checkm_file is a writable file-like object it is written to and
        returned; otherwise checkm_filename is opened and written. The line
        describing the manifest file itself is skipped.
        """
        logger.info("Creating checkm file for dir(%s) with Alg:%s and columns: %s" % (
                     scan_directory,
                     algorithm, columns))
        report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=columns)
        col_maxes = self._get_max_len(report)
        if checkm_file != None and hasattr(checkm_file, 'write'):
            checkm_file.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes)))
            for line in report:
                # Check that the scanned file is not the (empty) checkm file itself
                logger.info("Scanned file: %s vs Checkm filename: %s" % (line[0], checkm_filename))
                if line[0] != checkm_filename:
                    checkm_file.write("%s\n" % (self._space_line(line, col_maxes)))
                else:
                    logger.info("MATCH! - scan line ignored")
            return checkm_file
        else:
            with codecs.open(checkm_filename, encoding='utf-8', mode="w") as output:
                output.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes)))
                for line in report:
                    if line[0] != os.path.join(scan_directory, checkm_filename):
                        output.write("%s\n" % (self._space_line(line, col_maxes)))
                output.write("\n")

    def check_bagit_hashes(self, bagit_filename, algorithm=None):
        """Rescan the files listed in a bagit manifest and compare digests.

        If algorithm is None it is inferred from a filename of the form
        manifest-<alg>.txt. Returns {'pass': [paths],
        'fail': {path: (original_row, scan_row)}}.
        """
        logger.info("Checking files against '%s' bagit manifest" % bagit_filename)
        if algorithm == None:
            if hasattr(bagit_filename, 'read'):
                raise Exception("Need to supply the algorithm when passing a filelike object instead of a filename")
            m = re.search("manifest-(?P<alg>[^\.]+)\.txt", bagit_filename)
            if m != None:
                algorithm = m.groupdict()['alg']
            else:
                raise ValueError("Could not infer the hash algorithm from '%s'" % bagit_filename)
        parser = BagitParser(bagit_filename)
        scanner = CheckmScanner()
        results = {'pass':[], 'fail':{}}
        for row in parser:
            if row:
                try:
                    scan_row = scanner.scan_path(row[1], algorithm, 3)
                    if row[0] != scan_row[2]:
                        logger.info("Failed original: %s" % row)
                        logger.info("Current scan: %s" % scan_row)
                        results['fail'][row[1]] = (row, scan_row)
                    else:
                        results['pass'].append(row[1])
                except NotFound:
                    scan_row = "File not found"
                    logger.info("Failed original: %s" % row)
                    logger.info("But file not found at this path.")
                    results['fail'][row[1]] = (row, scan_row)
        return results

    def check_checkm_hashes(self, scan_directory, checkm_filename, ignore_multilevel=True):
        """Rescan the files listed in a checkm manifest and compare rows.

        With ignore_multilevel=False, @-prefixed include lines are followed
        and the manifests they point at are checked as well. Returns a dict
        with 'pass', 'fail' and 'include' entries.
        """
        def _check_files_against_parser(parser):
            scanner = CheckmScanner()
            results = {'pass':[], 'fail':{}, 'include':[]}
            for row in parser:
                if row:
                    try:
                        if row[0].startswith('@'):
                            row[0] = row[0][1:]
                            results['include'].append(row[0])
                        scan_row = scanner.scan_path(row[0], row[1], len(row))
                        if row != scan_row:
                            logger.info("Failed original: %s" % row)
                            logger.info("Current scan: %s" % scan_row)
                            results['fail'][row[0]] = (row, scan_row)
                        else:
                            results['pass'].append(row[0])
                    except NotFound:
                        scan_row = "File not found"
                        logger.info("Failed original: %s" % row)
                        logger.info("But file not found at this path.")
                        results['fail'][row[0]] = (row, scan_row)
            return results

        logger.info("Checking files against %s checkm manifest" % checkm_filename)
        parser = CheckmParser(checkm_filename)
        results = _check_files_against_parser(parser)
        if ignore_multilevel:
            return results
        else:
            # Shallow copy of the include list, as we will be pop'ing off items
            checkm_list = results['include'][:]
            while checkm_list:
                checkm_file = checkm_list.pop()
                parser = CheckmParser(checkm_file)
                additional_results = _check_files_against_parser(parser)
                # Merge passes, includes and failures into the overall results
                results['pass'].extend(additional_results['pass'])
                results['include'].extend(additional_results['include'])
                checkm_list.extend(additional_results['include'])
                results['fail'].update(additional_results['fail'])
            return results

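
# A hypothetical round trip with CheckmReporter (the paths and algorithm are
# invented): write a manifest for ./data, then verify the files against it.
def _example_reporter():
    reporter = CheckmReporter()
    reporter.create_checkm_file("data", "md5", "data.checkm", recursive=True, columns=3)
    results = reporter.check_checkm_hashes("data", "data.checkm")
    print "passed: %d, failed: %d" % (len(results['pass']), len(results['fail']))
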
class BagitParser(object):
    def __init__(self, bagit_file=None):
        """Optionally parse bagit_file (a path or file-like object) straight away."""
        self.status = False
        self.lines = []
        if bagit_file:
            self.parse(bagit_file)

    def __iter__(self):
        """Iterate over the parsed manifest rows."""
        return iter(self.lines)

    def parse(self, fileobj):
        """Parse a bagit manifest from a path or file-like object; return the rows."""
        if not hasattr(fileobj, "read"):
            with codecs.open(fileobj, encoding='utf-8', mode="r") as check_fh:
                self._parse_lines(check_fh)
        else:
            self._parse_lines(fileobj)
        return self.lines

    def _parse_lines(self, fh):
        """Split the stream into lines and store each as a [digest, path] row."""
        self.lines = []   # clear the deck
        line_buffer = ""

        def _parse_line(line):
            if not line.startswith('#'):
                tokens = filter(lambda x: x, re.split("\s+", line, 1))  # 2 columns
                logger.info(tokens)
                if len(tokens) == 2:
                    # Strip a leading "*" (binary-mode marker) from the path
                    if tokens[1].startswith("*"):
                        tokens[1] = tokens[1][1:].strip()
                    self.lines.append(tokens)

        # Read in 4k chunks, carrying any partial line over in line_buffer
        chunk = fh.read(0x1000)
        while chunk:
            line_buffer = line_buffer + chunk
            while True:
                fragments = line_buffer.split('\n', 1)
                if len(fragments) == 1:
                    break
                _parse_line(fragments[0])
                line_buffer = fragments[1]
            chunk = fh.read(0x1000)
        if line_buffer:
            _parse_line(line_buffer)    # final line without a trailing newline

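
# A small illustrative sketch of BagitParser on its own; the manifest
# filename here is hypothetical. Each parsed row is a [digest, path] pair.
def _example_bagit_parser():
    parser = BagitParser("manifest-md5.txt")
    for digest, path in parser:
        print digest, path
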
class CheckmParser(object):
    def __init__(self, checkm_file=None):
        """Optionally parse checkm_file (a path or file-like object) straight away."""
        self.status = False
        self.lines = []
        if checkm_file:
            self.parse(checkm_file)

    def __iter__(self):
        """Iterate over the parsed manifest rows."""
        return iter(self.lines)

    def parse(self, checkm_file):
        """Parse a checkm manifest from a path or file-like object; return the rows."""
        if not hasattr(checkm_file, "readline"):
            if os.path.isfile(checkm_file):
                with codecs.open(checkm_file, encoding='utf-8', mode="r") as check_fh:
                    self._parse_lines(check_fh)
            else:
                raise NotFound(checkm_file=checkm_file)
        else:
            self._parse_lines(checkm_file)
        return self.lines

    def _parse_lines(self, fh):
        """Split the stream into lines and store each as a list of up to six tokens."""
        self.lines = []   # clear the deck
        line_buffer = ""

        def _parse_line(line):
            if not line.startswith('#'):
                tokens = filter(lambda x: x, re.split("\s+", line, 5))  # 6 column max defn == 5 splits
                logger.info(tokens)
                if tokens:
                    self.lines.append(tokens)

        # Read in 4k chunks, carrying any partial line over in line_buffer
        chunk = fh.read(0x1000)
        while chunk:
            line_buffer = line_buffer + chunk
            while True:
                fragments = line_buffer.split('\n', 1)
                if len(fragments) == 1:
                    break
                _parse_line(fragments[0])
                line_buffer = fragments[1]
            chunk = fh.read(0x1000)
        if line_buffer:
            _parse_line(line_buffer)    # final line without a trailing newline

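
# A sketch of CheckmParser on a hypothetical manifest, showing the '@'
# convention used to include other checkm files.
def _example_checkm_parser():
    parser = CheckmParser("data.checkm")
    for row in parser:
        if row[0].startswith('@'):
            print "include:", row[0][1:]
        else:
            print "entry:  ", row
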
class CheckmScanner(object):
    HASHTYPES = ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512']

    def scan_local(self, directory_path, algorithm, columns=3):
        """Scan the immediate contents of directory_path; return a list of rows."""
        report = []
        for item in os.listdir(directory_path):
            item_path = os.path.join(directory_path, item)
            report.append(self.scan_path(item_path, algorithm, columns))
        return report

    def scan_tree(self, directory_path, algorithm, columns):
        """Recursively scan directory_path; return a list of rows."""
        report = []
        if os.path.exists(directory_path):
            for (dirpath, dirnames, filenames) in os.walk(directory_path):
                for item_path in [os.path.join(dirpath, x) for x in dirnames + filenames]:
                    report.append(self.scan_path(item_path, algorithm, columns))
            return report
        else:
            raise NotFound(directory_path=directory_path, recursive=True)

    def scan_path(self, item_path, algorithm, columns):
        """Build one manifest row for item_path: path, algorithm, digest
        (or 'd' for a directory), and optionally length and mtime."""
        if columns < 3 or not isinstance(columns, int):
            columns = 3
        try:
            line = []
            # col 1 - path
            line.append(unicode(item_path))
            # col 2 - algorithm
            line.append(unicode(algorithm))
            # col 3 - digest, or 'd' for a directory
            if os.path.isdir(item_path):
                line.append(u'd')
            else:
                # getattr raises AttributeError for an unknown algorithm,
                # caught below and reported as a ValueError
                hash_gen = getattr(hashlib, algorithm)()
                with open(item_path, 'rb') as fh:
                    logger.info("Checking %s with algorithm %s" % (item_path, algorithm))
                    chunk = fh.read(1024*8)
                    while chunk:
                        hash_gen.update(chunk)
                        chunk = fh.read(1024*8)
                line.append(unicode(hash_gen.hexdigest()))
            if columns > 3:
                # col 4 - Length
                line.append(unicode(os.stat(item_path)[ST_SIZE]))
            if columns > 4:
                # col 5 - ModTime
                line.append(unicode(os.stat(item_path)[ST_MTIME]))
            return line
        except OSError:
            raise NotFound(item_path=item_path)
        except IOError:
            raise NotFound(item_path=item_path)
        except AttributeError:
            raise ValueError("This tool cannot perform hashtype %s" % algorithm)

    def scan_directory(self, directory_path, algorithm, recursive=False, columns=3):
        """Scan a directory, recursively if requested; return a list of rows."""
        if os.path.exists(directory_path):
            if recursive:
                return self.scan_tree(directory_path, algorithm, columns)
            return self.scan_local(directory_path, algorithm, columns)
        else:
            raise NotFound(directory_path=directory_path, recursive=recursive)
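

# A runnable sketch of CheckmScanner (directory name and algorithm invented);
# with columns=5 each row is path, algorithm, digest, length and mtime.
if __name__ == '__main__':
    scanner = CheckmScanner()
    for row in scanner.scan_directory("data", "sha1", recursive=True, columns=5):
        print u"  ".join(row)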