Package cssutils :: Package scripts :: Module csscapture
[hide private]
[frames] | no frames]

Source Code for Module cssutils.scripts.csscapture

  1  #!/usr/bin/env python 
  2  """ 
  3  Retrieve all CSS stylesheets including embedded for a given URL. 
  4  Retrieve as StyleSheetList or save to disk. 
  5   
  6  TODO: 
  7      @import 
  8      save all 
  9   
 10      maybe use DOM 3 load/save? 
 11   
 12      logger class which handles all cases when no log is given... 
 13   
 14      saveto: 
 15          why does urllib2 hang? 
 16  """ 
 17  __all__ = ['CSSCapture'] 
 18  __docformat__ = 'restructuredtext' 
 19  __author__ = '$LastChangedBy: doerwalter $' 
 20  __date__ = '$LastChangedDate: 2007-08-21 17:18:10 +0200 (Di, 21 Aug 2007) $' 
 21  __version__ = '0.9.2a1, $LastChangedRevision: 263 $' 
 22   
 23  import errno 
 24  import HTMLParser 
 25  import logging 
 26  import os 
 27  import sys 
 28  import urllib 
 29  import urllib2 
 30  import urlparse 
 31  import xml.dom 
 32   
 33  import cssutils 
 34  from cssutils import css, stylesheets 
 35   
 36  try: 
 37      import encutils 
 38  except ImportError: 
 39      try: 
 40          import cssutils.encutils as encutils 
 41      except ImportError: 
 42          sys.exit("You need encutils from http://cthedot.de/encutils/") 
 43   
 44   
class CSSCaptureHTMLParser(HTMLParser.HTMLParser):
    """
    Collects the ``link`` and ``style`` elements of an HTML document
    which refer to or contain CSS.

    After feeding a document the results are available as

    links
        list of attribute dicts, one for each stylesheet ``<link>``
    styles
        list of ``(attrsdict, cssText)`` tuples, one for each
        ``<style>`` element

    An element counts as CSS if its ``type`` is ``text/css``. If no
    ``type`` is given a ``<style>`` is always taken as CSS and a
    ``<link>`` is taken as CSS if its ``rel`` contains ``stylesheet``.
    """
    # attributes whose values are compared case-insensitively; only these
    # are lower-cased. Values like href or title must keep their case.
    _CASE_INSENSITIVE = (u'type', u'media', u'rel')

    curtag = u''

    def reset(self):
        """
        (Re)initialize the parser and drop all collected results.

        Called by the base class constructor too, so every instance gets
        its own result lists instead of sharing class level ones.
        """
        # explicit base class call: HTMLParser is an old-style class in
        # Python 2 so super() cannot be used
        HTMLParser.HTMLParser.reset(self)
        self.curtag = u''
        # list of attrsdict
        self.links = []
        # list of (attrsdict, data)
        self.styles = []

    def _lowerattrs(self, attrs):
        """
        Return the given ``(name, value)`` pairs as a dict with
        lower-cased names.

        Values of attributes without a value (``<link disabled>``) are
        reported as ``None`` by the parser and are stored as ``u''``.
        Only values of the attributes in ``_CASE_INSENSITIVE`` are
        lower-cased, all other values are kept as given.
        """
        result = {}
        for name, value in attrs:
            name = name.lower()
            if value is None:
                value = u''
            elif name in self._CASE_INSENSITIVE:
                value = value.lower()
            result[name] = value
        return result

    def _iscss(self, attrs, default):
        """
        Return True if the (already normalized) ``attrs`` declare CSS.

        default
            result to use if no ``type`` is given at all
        """
        ctype = attrs.get(u'type', u'').strip()
        if not ctype:
            return default
        return ctype == u'text/css'

    def handle_starttag(self, tag, attrs):
        """ register stylesheet links and start collecting style content """
        if tag == u'link':
            attrs = self._lowerattrs(attrs)
            isstylesheet = u'stylesheet' in attrs.get(u'rel', u'').split()
            if self._iscss(attrs, isstylesheet):
                self.links.append(attrs)
        # also get content of tag
        elif tag == u'style':
            attrs = self._lowerattrs(attrs)
            if self._iscss(attrs, True):
                self.styles.append((attrs, u''))
                self.curtag = tag
        else:
            # close as style cannot contain any elements
            self.curtag = u''

    def handle_data(self, data):
        """ append ``data`` to the style element currently being read """
        if self.curtag == u'style':
            # content may arrive in several pieces, so add to what has
            # been collected already instead of replacing it
            attrs, collected = self.styles[-1]
            self.styles[-1] = (attrs, collected + data)

    def handle_comment(self, data):
        """ style might have comment content, treat same as data """
        self.handle_data(data)

    def handle_endtag(self, tag):
        """ stop collecting, style cannot contain any elements """
        self.curtag = u''
class CSSCapture(object):
    """
    Retrieve all CSS stylesheets including embedded for a given URL.
    Optional setting of User-Agent used for retrieval possible
    to handle browser sniffing servers.

    An ``urllib2.HTTPError`` during a request is logged and the request
    is skipped (for the main document `capture` exits instead). Other
    errors raised by ``urllib2.urlopen`` are not caught here.
    """

    def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
        """
        initialize a new Capture object

        ua
            init User-Agent to use for requests
        log
            supply a log object which is used instead of the default
            log which writes to sys.stderr
        defaultloglevel
            constant of logging package which defines the level of the
            default log if no explicit log given
        """
        self._ua = ua
        self._parser = CSSCaptureHTMLParser()

        if log:
            self._log = log
        else:
            self._log = logging.getLogger('CSSCapture')
            # the named logger is shared by all instances, so attach the
            # stderr handler only once or every message is repeated
            if not self._log.handlers:
                hdlr = logging.StreamHandler(sys.stderr)
                formatter = logging.Formatter('%(levelname)s\t%(message)s')
                hdlr.setFormatter(formatter)
                self._log.addHandler(hdlr)
            self._log.setLevel(defaultloglevel)
            self._log.debug(u'(C) Using default log')

    def _doRequest(self, url):
        """
        Does an HTTP request

        Returns: (response, url), or (None, None) if the server answered
        with an HTTP error (which is logged)

        url might have been changed by server due to redirects etc
        """
        self._log.debug(u'(C) _doRequest URL: %s', url)

        req = urllib2.Request(url)
        if self._ua:
            req.add_header('User-agent', self._ua)
            self._log.info('(C) Using User-Agent: %s', self._ua)
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            self._log.critical('(C) %s\n%s %s\n%s',
                               e.geturl(), e.code, e.msg, e.headers)
            return None, None

        # get real url
        if url != res.geturl():
            url = res.geturl()
            self._log.info('(C) URL retrieved: %s', url)

        return res, url

    def _readResponse(self, res):
        """ return the complete body of response ``res`` and close it """
        try:
            return res.read()
        finally:
            res.close()

    def _setCssText(self, sheet, cssText):
        """ parse ``cssText`` into ``sheet``, parse errors are only logged """
        try:
            sheet.cssText = cssText
        except xml.dom.DOMException as e:
            self._log.warning('(C)   * CSSParser message:\n%s', e)

    def _doImports(self, parentStyleSheet, baseurl=None, _ancestors=()):
        """
        handle all @import CSS stylesheet recursively
        found CSS stylesheets are appended to stylesheetlist

        parentStyleSheet
            sheet whose @import rules are followed
        baseurl
            URL the hrefs of the @import rules are relative to
        _ancestors
            internal, URLs of all sheets which lead to parentStyleSheet;
            used to stop sheets which import each other in a circle
        """
        chain = _ancestors + (baseurl,)
        for rule in parentStyleSheet.cssRules:
            if rule.type != css.CSSRule.IMPORT_RULE:
                continue

            href = urlparse.urljoin(baseurl, rule.href)
            media = rule.media
            if href in chain:
                # would recurse forever otherwise
                self._log.warning(
                    '(C) - SKIPPING cyclic @import: %s', href)
                continue
            res, href = self._doRequest(href)
            if not res:
                continue
            cssText = self._readResponse(res)
            sheet = css.CSSStyleSheet(
                href=href,
                media=media,
                parentStyleSheet=parentStyleSheet
                )
            self.stylesheetlist.append(sheet)

            self._log.info(
                '(C) - FOUND @import in: %s', parentStyleSheet)
            self._log.info('(C)   * stylesheet : %s', sheet)
            self._log.info('(C)   * full href  : %s', href)
            self._log.info('(C)   * media      : %s', media.mediaText)
            self._log.debug('(C)   * cssText    :\n%s', cssText)

            self._setCssText(sheet, cssText)
            self._doImports(sheet, baseurl=href, _ancestors=chain)

    def _findStyleSheets(self, docurl, doctext):
        """
        parse text for stylesheets
        fills stylesheetlist with all found StyleSheets

        docurl
            to build a full url of found StyleSheets @href
        doctext
            to parse
        """
        # use a fresh parser with its own result lists for each document
        # so results of an earlier capture are not processed again
        parser = self._parser = CSSCaptureHTMLParser()
        parser.links = []
        parser.styles = []
        parser.feed(doctext)

        # <link>ed stylesheets
        # ownerNode should be set to the <link> node
        for link in parser.links:

            linkhref = link.get(u'href', u'')
            if not linkhref:
                # joining an empty href gives docurl, i.e. the HTML
                # document itself would be fetched as a stylesheet
                self._log.warning(
                    '(C) - SKIPPING <link> without href: %s', link)
                continue
            href = urlparse.urljoin(docurl, linkhref)
            media = stylesheets.MediaList(link.get(u'media', u''))
            res, href = self._doRequest(href)
            if not res:
                continue
            cssText = self._readResponse(res)
            sheet = css.CSSStyleSheet(
                href=href,
                media=media,
                title=link.get(u'title', u''),
                )
            self.stylesheetlist.append(sheet)

            self._log.info('(C) - FOUND <link>: %s', link)
            self._log.info('(C)   * stylesheet: %s', sheet)
            self._log.info('(C)   * full href : %s', href)
            self._log.info('(C)   * media     : %s', media.mediaText)
            self._log.debug('(C)   * cssText    :\n%s', cssText)

            self._setCssText(sheet, cssText)
            # @import hrefs are relative to the stylesheet which contains
            # them, not to the document which links the stylesheet
            self._doImports(sheet, baseurl=href)

        # internal <style>sheets
        # href is None for internal stylesheets
        # ownerNode should be set to the <style> node
        for stylemeta, cssText in parser.styles:

            media = stylesheets.MediaList(stylemeta.get(u'media', u''))
            sheet = css.CSSStyleSheet(
                href=None,
                media=media,
                title=stylemeta.get(u'title', u''),
                )
            self.stylesheetlist.append(sheet)

            self._log.info('(C) - FOUND <style>: %s', stylemeta)
            self._log.info('(C)   * stylesheet : %s', sheet)
            self._log.info('(C)   * media      : %s', media.mediaText)
            self._log.debug('(C)   * cssText    :\n%s', cssText)

            self._setCssText(sheet, cssText)
            # embedded sheets live in the document, so docurl is the base
            self._doImports(sheet, baseurl=docurl)

    def capture(self, url, ua=None):
        """
        Capture stylesheets for the given url.

        If the document at ``url`` itself cannot be retrieved because of
        an HTTPError the error is logged and ``sys.exit(1)`` is called.
        Stylesheets which cannot be retrieved are logged and left out.

        url
            to capture CSS from
        ua
            User-Agent to use for requests

        Returns StyleSheetList.
        """
        if ua is not None:
            self._ua = ua

        # used to save inline styles
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        self._filename = os.path.basename(path)

        self.stylesheetlist = stylesheets.StyleSheetList()

        self._log.debug('(C) CSSCapture.capture(%s)', url)
        self._log.info('(C) URL supplied: %s', url)

        # get url content
        res, url = self._doRequest(url)
        if not res:
            sys.exit(1)
        try:
            rawdoc = res.read()
            # the response is handed to encutils too, so it is closed
            # only after the encoding has been detected
            encoding = encutils.getEncodingInfo(
                res, rawdoc, log=self._log).encoding
        finally:
            res.close()
        self._log.info('(C) Using Encoding: %s', encoding)

        doctext = rawdoc.decode(encoding)

        # fill list of stylesheets
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist

    def _savepath(self, dir, url):
        """
        Return ``(savepath, filename)`` to save the sheet of ``url`` to.

        ``savepath`` is ``dir`` followed by the host and the directory
        part of the path of ``url``. Empty, ``.`` and ``..`` parts are
        left out so the result always stays below ``dir``, whatever URL
        a server supplies. If ``url`` has no path at all the filename
        ``index.css`` is used.
        """
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        # no absolute path, os.path.join would discard dir otherwise
        path = os.path.normpath(path.lstrip('/'))
        ignored = ('', os.curdir, os.pardir)
        # normpath resolved inner ".." already, only leading ones remain
        segments = [s for s in path.split(os.sep) if s not in ignored]
        if segments:
            fn = segments.pop()
        else:
            fn = 'index.css'
        parts = [p for p in [loc] + segments if p not in ignored]
        return os.path.join(dir, *parts), fn

    def saveto(self, dir, saveparsed=False):
        """
        saves css in "dir" in the same layout as on the server
        internal stylesheets are saved as "dir/NAME_INLINE_N.css" where
        NAME is the filename of the captured URL and N counts from 0

        dir
            directory to save files to
        saveparsed
            currently ignored, always the parsed version of the CSS as
            serialized by cssutils is saved
        """
        inlines = 0
        for sheet in self.stylesheetlist:

            url = sheet.href
            if not url:
                url = '%s_INLINE_%s.css' % (
                    self._filename, inlines)
                inlines += 1

            # NOTE(review): this changes a cssutils wide serializer
            # preference and never restores it
            cssutils.ser.prefs.keepAllProperties = True
            cssText = sheet.cssText

            # build savepath
            savepath, fn = self._savepath(dir, url)
            savefn = os.path.join(savepath, fn)

            try:
                os.makedirs(savepath)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    # bare raise keeps the original traceback
                    raise
                self._log.debug('(C) Path "%s" already exists.', savepath)

            out = open(savefn, 'w')
            try:
                out.write(cssText)
            finally:
                out.close()
            self._log.info('(C) Saving "%s"', savefn)
def main(args=None):
    """
    Command line entry point: capture the stylesheets of a URL and save
    or list them.

    args
        list of command line arguments (without the program name),
        if None ``sys.argv[1:]`` is used

    Exits via ``parser.error`` if no URL is given.
    """
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", default to "_CSSCapture_SAVED"')
    parser.add_option('-p', '--saveparsed', action='store_true', dest='saveparsed',
        help='if given saves cssutils\' parsed files, otherwise original retrieved files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    # hand the given args on, parse_args falls back to sys.argv[1:] for None
    options, urls = parser.parse_args(args)

    if not urls:
        # parser.error prints the usage and exits
        parser.error('no URL given')
    url = urls[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    # START
    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    if not options.notsave:
        saveto = options.saveto or '_CSSCapture_SAVED'
        c.saveto(saveto, saveparsed=options.saveparsed)
    else:
        for i, s in enumerate(stylesheetlist):
            # same output as the former print statement, number and text
            # separated by a blank and followed by a newline
            sys.stdout.write('%s \tTitle: "%s", \n\thref: "%s"\n\n' % (
                i + 1, s.title, s.href))


if __name__ == "__main__":
    sys.exit(main())