Package cssutils :: Package scripts :: Module csscapture
[hide private]
[frames | no frames]

Source Code for Module cssutils.scripts.csscapture

  1  #!/usr/bin/env python 
  2  """Retrieve all CSS stylesheets including embedded for a given URL. 
  3  Retrieve as StyleSheetList or save to disk - raw, parsed or minified version. 
  4   
  5  TODO: 
  6  - maybe use DOM 3 load/save? 
  7  - logger class which handles all cases when no log is given... 
  8  - saveto: why does urllib2 hang? 
  9  """ 
 10  __all__ = ['CSSCapture'] 
 11  __docformat__ = 'restructuredtext' 
 12  __author__ = '$LastChangedBy: cthedot $' 
 13  __date__ = '$LastChangedDate: 2008-01-27 17:28:41 +0100 (So, 27 Jan 2008) $' 
 14  __version__ = '$LastChangedRevision: 950 $' 
 15   
 16  import codecs 
 17  import errno 
 18  import HTMLParser 
 19  import logging 
 20  import os 
 21  import sys 
 22  import urllib2 
 23  import urlparse 
 24   
 25  import cssutils 
 26  try: 
 27      import encutils 
 28  except ImportError: 
 29      try: 
 30          import cssutils.encutils as encutils 
 31      except ImportError: 
 32          sys.exit("You need encutils from http://cthedot.de/encutils/") 
 33   
class CSSCaptureHTMLParser(HTMLParser.HTMLParser):
    """
    Parses given data for link and style elements declared as "text/css".

    After feeding a document the results are available as

    links
        list of attribute dicts, one per matching <link> element
    styles
        list of (attribute dict, text content) tuples, one per matching
        <style> element

    Both lists belong to the instance and are emptied by ``reset()``.
    """
    # Attributes whose values are matched case-insensitively. All other
    # values (notably href and title) keep their case, because URL paths
    # and titles are case sensitive.
    _CASE_INSENSITIVE = (u'type', u'rel', u'media')

    def reset(self):
        """
        Reset the parser and forget all collected elements.

        The base class constructor calls ``reset()``, so this is also where
        the per-instance result lists are created; keeping them on the
        class would share one list between all parsers.
        """
        # HTMLParser is an old-style class on Python 2, so no super() here
        HTMLParser.HTMLParser.reset(self)
        self.curtag = u''
        self.links = []   # list of attrsdict
        self.styles = []  # list of (attrsdict, data)

    def _lowerattrs(self, attrs):
        """
        Return ``attrs`` (a list of (name, value) pairs) as a dict with
        lower-cased names.

        Values of valueless attributes (reported as None by the parser)
        become u''; values are lower-cased only for the attributes listed
        in ``_CASE_INSENSITIVE``.
        """
        normalized = {}
        for name, value in attrs:
            name = name.lower()
            if value is None:
                value = u''
            elif name in self._CASE_INSENSITIVE:
                value = value.lower()
            normalized[name] = value
        return normalized

    def handle_starttag(self, tag, attrs):
        """Record text/css <link> elements and open text/css <style> elements."""
        if tag == u'link':
            attrs = self._lowerattrs(attrs)
            if attrs.get(u'type', u'') == u'text/css':
                self.links.append(attrs)
        # also get content of tag
        elif tag == u'style':
            attrs = self._lowerattrs(attrs)
            if attrs.get(u'type', u'') == u'text/css':
                self.styles.append((attrs, u''))
                self.curtag = tag
        else:
            # close as style cannot contain any elements
            self.curtag = u''

    def handle_data(self, data):
        """Append ``data`` to the content of the currently open <style>."""
        if self.curtag == u'style':
            attrs, text = self.styles[-1]
            # content may arrive in several pieces, so accumulate instead
            # of keeping only the last piece
            self.styles[-1] = (attrs, text + data)

    def handle_comment(self, data):
        """Treat comment content like data; style might have comment content."""
        self.handle_data(data)

    def handle_endtag(self, tag):
        """Any end tag closes the current style; it cannot contain elements."""
        self.curtag = u''
71
class CSSCapture(object):
    """
    Retrieve all CSS stylesheets including embedded for a given URL.
    Optional setting of User-Agent used for retrieval possible
    to handle browser sniffing servers.

    HTTP errors are logged, not raised: a failing request for the document
    itself ends ``capture`` with ``sys.exit(1)``, a failing request for a
    single stylesheet only skips that stylesheet.
    """
    def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
        """
        initialize a new Capture object

        ua
            init User-Agent to use for requests
        log
            supply a log object which is used instead of the default
            log which writes to sys.stderr
        defaultloglevel
            constant of logging package which defines the level of the
            default log if no explicit log given
        """
        self._ua = ua
        self._parser = CSSCaptureHTMLParser()

        if log is not None:
            self._log = log
        else:
            self._log = logging.getLogger('CSSCapture')
            # getLogger() hands out the same logger to every CSSCapture,
            # so attach the stderr handler only once; otherwise each new
            # instance would duplicate every message
            if not self._log.handlers:
                hdlr = logging.StreamHandler(sys.stderr)
                hdlr.setFormatter(logging.Formatter('%(message)s'))
                self._log.addHandler(hdlr)
            self._log.setLevel(defaultloglevel)
            self._log.debug(u'Using default log')

    def _doRequest(self, url):
        """
        Does an HTTP request

        Returns: (response, url), or (None, None) if the server answered
        with an HTTP error (which is logged)

        url might have been changed by server due to redirects etc
        """
        self._log.debug(u' CSSCapture._doRequest\n * URL: %s' % url)

        req = urllib2.Request(url)
        if self._ua:
            req.add_header('User-agent', self._ua)
            self._log.info(' * Using User-Agent: %s', self._ua)
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            self._log.critical(' %s\n%s %s\n%s' % (
                e.geturl(), e.code, e.msg, e.headers))
            return None, None

        # get real url
        if url != res.geturl():
            url = res.geturl()
            self._log.info(' URL retrieved: %s', url)

        return res, url

    def _createStyleSheet(self, href=None,
                          media=None,
                          parentStyleSheet=None,
                          title=u'',
                          cssText=None,
                          encoding=None):
        """
        returns CSSStyleSheet read from href or if cssText is given use that,
        returns None if the sheet could not be retrieved or decoded

        encoding
            used if inline style found, same as self.docencoding
        """
        # ``is None`` (not falsiness): an empty inline <style> is still an
        # inline sheet and must not trigger a request for href=None
        if cssText is None:
            requested = href  # _doRequest returns None as url on failure
            res, href = self._doRequest(href)
            if res is None:
                self._log.error(u' ERROR accessing CSS\n\t%s', requested)
                return None
            if not encoding:
                media_type, encoding = encutils.getHTTPInfo(res)
                if media_type != u'text/css':
                    self._log.warning(u' WARNING: HTTP media type is different than expected "text/css": %r' %
                                      media_type)
            try:
                cssText = codecs.getreader('css')(res,
                                                  encoding=encoding).read()
            except UnicodeDecodeError as e:
                self._log.error(u' Error retrieving CSS, probably encoding mismatch:\n\t%s\n\t%s'
                                % (href, e))
                return None

        sheet = cssutils.parseString(cssText)
        sheet.href = href
        sheet.media = media
        sheet._parentStyleSheet = parentStyleSheet
        sheet.title = title
        self._log.debug(u' * title: %s', title)
        if href:
            self._log.info(u' * href : %s', href)
        if media is not None:
            self._log.info(u' * media: %s', media.mediaText)
        self._log.info(u' %s\n' % sheet)
        self._log.debug(u' * cssText:\n%s\n', cssText)

        # remember the unparsed text, saveto(saveraw=True) writes it
        self._nonparsed[sheet] = cssText
        return sheet

    def _doImports(self, parentStyleSheet, baseurl=None, _ancestors=()):
        """
        handle all @import CSS stylesheet recursively
        found CSS stylesheets are appended to stylesheetlist

        parentStyleSheet
            sheet whose @import rules are followed
        baseurl
            URL of parentStyleSheet, used to resolve relative @import URLs
        _ancestors
            internal: URLs of the sheets on the current import chain, used
            to stop circular imports (a imports b imports a)
        """
        chain = tuple(_ancestors) + (baseurl,)
        for rule in parentStyleSheet.cssRules:
            if rule.type != rule.IMPORT_RULE:
                continue
            self._log.info(u'\n@import FOUND -----')
            self._log.debug(u' IN: %s\n' % parentStyleSheet)
            href = urlparse.urljoin(baseurl, rule.href)
            if href in chain:
                # following it again would recurse without end
                self._log.warning(u' Skipping circular @import: %s', href)
                continue
            sheet = self._createStyleSheet(
                href=href,
                media=rule.media,
                parentStyleSheet=parentStyleSheet)
            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, baseurl=href, _ancestors=chain)

    def _findStyleSheets(self, docurl, doctext):
        """
        parse text for stylesheets
        fills stylesheetlist with all found StyleSheets

        docurl
            to build a full url of found StyleSheets @href
        doctext
            to parse
        """
        self._parser.feed(doctext)
        # <link>ed stylesheets, ownerNode should be set to the <link> node
        for link in self._parser.links:
            self._log.info(u'\n<link> FOUND -----')
            self._log.debug(u' %s\n' % link)
            href = urlparse.urljoin(docurl, link.get(u'href', u''))
            sheet = self._createStyleSheet(
                href=href,
                media=cssutils.stylesheets.MediaList(
                    link.get(u'media', u'')),
                title=link.get(u'title', u''))
            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, baseurl=href)

        # internal <style> sheets
        # href is None for internal stylesheets
        # ownerNode should be set to the <style> node
        for stylemeta, cssText in self._parser.styles:
            self._log.info(u'\n<style> FOUND -----')
            self._log.debug(u' %s\n' % stylemeta)
            sheet = self._createStyleSheet(
                media=cssutils.stylesheets.MediaList(
                    stylemeta.get(u'media', u'')),
                title=stylemeta.get(u'title', u''),
                cssText=cssText,
                encoding=self.docencoding)
            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, baseurl=docurl)

    def capture(self, url, ua=None):
        """
        Capture stylesheets for the given url. If the document itself
        cannot be retrieved the error is logged and sys.exit(1) is called.

        url
            to capture CSS from
        ua
            User-Agent to use for requests

        Returns StyleSheetList.
        """
        if ua is not None:
            self._ua = ua

        self._log.info(u'\nCapturing CSS from URL: %s\n', url)
        self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

        # used to save inline styles
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        self._filename = os.path.basename(path)

        # get url content
        res, url = self._doRequest(url)
        if res is None:
            sys.exit(1)
        rawdoc = res.read()

        self.docencoding = encutils.getEncodingInfo(
            res, rawdoc, log=self._log).encoding
        self._log.info(u'\nUsing Encoding: %s\n', self.docencoding)

        doctext = rawdoc.decode(self.docencoding)

        # Start every capture with an empty parser so elements found in a
        # previously captured document are not reported again. The result
        # lists are set on the instance explicitly so this holds even if
        # the parser class keeps them as class attributes.
        self._parser = CSSCaptureHTMLParser()
        self._parser.links = []
        self._parser.styles = []

        # fill list of stylesheets and list of raw css
        self._nonparsed = {}
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist

    def saveto(self, dir, saveraw=False, minified=False):
        """
        saves css in "dir" in the same layout as on the server
        internal stylesheets are saved as
        "dir/<document filename>_INLINE_<number>.css"

        dir
            directory to save files to
        saveraw
            save literal CSS from server instead of the parsed CSS
        minified
            save minified CSS (switches the global cssutils serializer
            preferences to minified); has no effect on raw output

        Both parsed and minified (which is also parsed of course) will
        lose information which cssutils is unable to understand or where
        it is simply buggy. You might want to first save the raw version
        before parsing or even minifying it.

        Stylesheets whose URL would place the file outside of "dir" are
        logged and skipped, URLs come from the captured (untrusted) page.
        """
        if minified:
            cssutils.ser.prefs.useMinified()
        # label used in the log only; raw text wins if both flags are set
        if saveraw:
            msg = 'raw'
        elif minified:
            msg = 'minified'
        else:
            msg = 'parsed'

        # absolute target directory including trailing separator
        root = os.path.join(os.path.abspath(dir), '')

        inlines = 0
        for sheet in self.stylesheetlist:
            url = sheet.href
            if not url:
                inlines += 1
                url = '%s_INLINE_%s.css' % (
                    self._filename, inlines)

            # build savepath
            scheme, loc, path, query, fragment = urlparse.urlsplit(url)
            # no absolute path: os.path.join would discard "dir" otherwise
            path = os.path.normpath(path.lstrip('/'))
            path, fn = os.path.split(path)
            savepath = os.path.join(dir, loc, path)
            savefn = os.path.join(savepath, fn)

            # ".." in host or path may still point outside of "dir"
            if not (os.path.abspath(savefn) + os.sep).startswith(root):
                self._log.error(
                    u'Not saving "%s": "%s" is outside of "%s"',
                    url, savefn, dir)
                continue

            try:
                os.makedirs(savepath)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
                self._log.debug(u'Path "%s" already exists.', savepath)

            if saveraw:
                cssText = self._nonparsed[sheet]
            else:
                cssText = sheet.cssText

            self._log.info(u'Saving %s "%s"' % (msg, savefn))
            sf = open(savefn, 'wb')
            try:
                codecs.getwriter('css')(sf).write(cssText)
            finally:
                # close even if encoding or writing fails
                sf.close()
339
def main(args=None):
    """
    Command line entry point: capture the CSS of the given URL and either
    save it to disk or only list the stylesheets found.

    args
        list of command line arguments without the program name;
        if None optparse reads sys.argv[1:]
    """
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    parser.add_option('-m', '--minified', action='store_true', dest='minified',
        help='saves minified version of captured files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-r', '--saveraw', action='store_true', dest='saveraw',
        help='if given saves raw css otherwise cssutils\' parsed files')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", defaults to "_CSSCapture_SAVED"')
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    # pass args on, otherwise the parameter of main() is silently ignored
    options, url = parser.parse_args(args)

    if not url:
        # parser.error() prints the usage and exits with status 2
        parser.error('no URL given')
    else:
        url = url[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    # START
    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    if not options.notsave:
        saveto = options.saveto or '_CSSCapture_SAVED'
        c.saveto(saveto, saveraw=options.saveraw, minified=options.minified)
    else:
        for i, s in enumerate(stylesheetlist):
            # single-argument print: same output as the former
            # ``print i+1, text`` and valid on Python 2 and 3
            print(u'%s \ttitle: "%s", \n\thref : "%s"\n' % (
                i + 1, s.title, s.href))


if __name__ == "__main__":
    sys.exit(main())