Package cssutils :: Module script
[hide private]
[frames | no frames]

Source Code for Module cssutils.script

  1  """classes and functions used by cssutils scripts 
  2  """ 
# public API of this module
__all__ = ['CSSCapture', 'csscombine']
__docformat__ = 'restructuredtext'
__version__ = '$Id: parse.py 1323 2008-07-06 18:13:57Z cthedot $'

import codecs
import errno
import HTMLParser
import logging
import os
import sys
import urllib2
import urlparse

import cssutils
# encutils is tried as part of the cssutils package first and as a
# standalone module second; if neither can be imported the process is
# ended right here at import time
try:
    import cssutils.encutils as encutils
except ImportError:
    try:
        import encutils
    except ImportError:
        sys.exit("You need encutils from http://cthedot.de/encutils/")

# types of sheets in HTML
# (first item of every entry collected in CSSCaptureHTMLParser.sheets)
LINK = 0 # <link rel="stylesheet" type="text/css" href="..." [@title="..." @media="..."]/>
STYLE = 1 # <style type="text/css" [@title="..."]>...</style>
 28   
class CSSCaptureHTMLParser(HTMLParser.HTMLParser):
    """CSSCapture helper: Parse given data for link and style elements

    After feeding a document ``sheets`` holds one entry for every
    ``text/css`` sheet found, in document order:

    - ``(LINK, atts)`` for a ``<link>`` element
    - ``(STYLE, [atts, cssText])`` for a ``<style>`` element

    ``atts`` is a dict as built by ``_loweratts``.
    """
    # attributes whose value must keep its case: URLs are case sensitive
    # and a title is literal text
    _KEEP_CASE = (u'href', u'title')

    curtag = u''

    def reset(self):
        """Reset the parser including the collected ``sheets``.

        ``HTMLParser.__init__`` calls ``reset`` so every instance gets a
        list of its own here instead of sharing one list on class level.
        """
        HTMLParser.HTMLParser.reset(self)
        self.curtag = u''
        self.sheets = [] # (type, [atts, cssText])

    def _loweratts(self, atts):
        """Return ``atts``, a list of ``(name, value)``, as a dict.

        Names are lowercased. Values are lowercased too, except for the
        attributes in ``_KEEP_CASE`` and for ``None`` (an attribute given
        without a value), which are kept unchanged.
        """
        lowered = {}
        for name, value in atts:
            name = name.lower()
            if value is not None and name not in self._KEEP_CASE:
                value = value.lower()
            lowered[name] = value
        return lowered

    def handle_starttag(self, tag, atts):
        """Collect ``text/css`` link and style elements in ``sheets``."""
        if tag == u'link':
            atts = self._loweratts(atts)
            if u'text/css' == atts.get(u'type', u''):
                self.sheets.append((LINK, atts))
        elif tag == u'style':
            # also get content of style
            atts = self._loweratts(atts)
            if u'text/css' == atts.get(u'type', u''):
                self.sheets.append((STYLE, [atts, u'']))
                self.curtag = tag
        else:
            # close as only interesting <style> cannot contain any elements
            self.curtag = u''

    def handle_data(self, data):
        """Use ``data`` as cssText of the style element currently open."""
        if self.curtag == u'style':
            self.sheets[-1][1][1] = data # replace cssText

    def handle_comment(self, data):
        """Treat comment content like character data."""
        # style might have comment content, treat same as data
        self.handle_data(data)

    def handle_endtag(self, tag):
        """Any end tag ends the content of a style element."""
        # close as style cannot contain any elements
        self.curtag = u''
class CSSCapture(object):
    """
    Retrieve all CSS stylesheets including embedded for a given URL.
    Optional setting of User-Agent used for retrieval possible
    to handle browser sniffing servers.

    An ``urllib2.HTTPError`` while requesting the document itself is not
    raised: it is logged as critical and ``capture`` ends the process
    with ``sys.exit(1)``.
    """
    def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
        """
        initialize a new Capture object

        ua
            init User-Agent to use for requests
        log
            supply a log object which is used instead of the default
            log which writes to sys.stderr
        defaultloglevel
            constant of logging package which defines the level of the
            default log if no explicit log given
        """
        self._ua = ua

        if log:
            self._log = log
        else:
            self._log = logging.getLogger('CSSCapture')
            # getLogger returns the same logger for every instance, so
            # only add a handler if an earlier instance did not already,
            # else every message would be written several times
            if not self._log.handlers:
                hdlr = logging.StreamHandler(sys.stderr)
                formatter = logging.Formatter('%(message)s')
                hdlr.setFormatter(formatter)
                self._log.addHandler(hdlr)
            self._log.setLevel(defaultloglevel)
            self._log.debug(u'Using default log')

        self._htmlparser = CSSCaptureHTMLParser()
        self._cssparser = cssutils.CSSParser(log = self._log)

    def _doRequest(self, url):
        """Do an HTTP request

        Return (url, response), response being the object returned by
        ``urllib2.urlopen``.
        url might have been changed by server due to redirects etc

        An ``urllib2.HTTPError`` is logged as critical and
        ``(None, None)`` is returned in that case.
        """
        self._log.debug(u' CSSCapture._doRequest\n * URL: %s' % url)

        req = urllib2.Request(url)
        if self._ua:
            req.add_header('User-agent', self._ua)
            self._log.info(' * Using User-Agent: %s', self._ua)

        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError:
            # exception fetched with sys.exc_info() so the clause does
            # not depend on version specific "except" syntax
            e = sys.exc_info()[1]
            self._log.critical(' %s\n%s %s\n%s' % (
                e.geturl(), e.code, e.msg, e.headers))
            return None, None

        # get real url
        if url != res.geturl():
            url = res.geturl()
            self._log.info(' URL retrieved: %s', url)

        return url, res

    def _createStyleSheet(self, href=None,
                          media=None,
                          parentStyleSheet=None,
                          title=u'',
                          cssText=None,
                          encoding=None):
        """
        Return CSSStyleSheet read from href or if cssText is given use that.
        Return None if parsing yields no sheet.

        The text the sheet has been parsed from is remembered in
        ``self._nonparsed`` to be able to save it unchanged later.

        encoding
            used if inline style found, same as self.docencoding
        parentStyleSheet
            currently not used
        """
        if cssText is None:
            encoding, cssText = cssutils.util._readUrl(
                href, parentEncoding=self.docencoding)
            encoding = None # already decoded???

        sheet = self._cssparser.parseString(cssText, href=href, media=media,
                                            title=title, encoding=encoding)

        if not sheet:
            return None

        self._log.info(u' %s\n' % sheet)
        self._nonparsed[sheet] = cssText
        return sheet

    def _findStyleSheets(self, docurl, doctext):
        """
        parse text for stylesheets
        fills stylesheetlist with all found StyleSheets

        docurl
            to build a full url of found StyleSheets @href
        doctext
            to parse
        """
        # TODO: ownerNode should be set to the <link> node

        # start with a clean parser: sheets found in a document captured
        # earlier must not be reported for this document again
        self._htmlparser.reset()
        self._htmlparser.curtag = u''
        self._htmlparser.sheets = []
        self._htmlparser.feed(doctext)

        for typ, data in self._htmlparser.sheets:
            sheet = None

            if LINK == typ:
                self._log.info(u'+ PROCESSING <link> %r' % data)

                atts = data
                href = urlparse.urljoin(docurl, atts.get(u'href', None))
                sheet = self._createStyleSheet(href=href,
                                               media=atts.get(u'media', None),
                                               title=atts.get(u'title', None))
            elif STYLE == typ:
                self._log.info(u'+ PROCESSING <style> %r' % data)

                atts, cssText = data
                sheet = self._createStyleSheet(cssText=cssText,
                                               href = docurl,
                                               media=atts.get(u'media', None),
                                               title=atts.get(u'title', None),
                                               encoding=self.docencoding)
                if sheet:
                    sheet._href = None # inline have no href!
                    # goes to the log, a library must not write to stdout
                    self._log.debug('%s', sheet.cssText)

            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, base=docurl)

    def _doImports(self, parentStyleSheet, base=None):
        """
        handle all @import CSS stylesheet recursively
        found CSS stylesheets are appended to stylesheetlist

        parentStyleSheet
            sheet whose @import rules are followed
        base
            URL the href of an @import rule is joined with
        """
        # TODO: only if not parsed these have to be read extra!

        for rule in parentStyleSheet.cssRules:
            if rule.type == rule.IMPORT_RULE:
                self._log.info(u'+ PROCESSING @import:')
                self._log.debug(u' IN: %s\n' % parentStyleSheet.href)
                sheet = rule.styleSheet
                href = urlparse.urljoin(base, rule.href)
                if sheet:
                    self._log.info(u' %s\n' % sheet)
                    self.stylesheetlist.append(sheet)
                    self._doImports(sheet, base=href)

    def capture(self, url):
        """
        Capture all stylesheets at given URL's HTML document.

        If requesting ``url`` fails with an HTTPError the error is logged
        and the process is ended with ``sys.exit(1)``.

        url
            to capture CSS from

        Returns ``cssutils.stylesheets.StyleSheetList``.
        """
        self._log.info(u'\nCapturing CSS from URL:\n %s\n', url)
        self._nonparsed = {}
        self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

        # used to save inline styles
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        self._filename = os.path.basename(path)

        # get url content
        url, res = self._doRequest(url)
        if not res:
            sys.exit(1)

        rawdoc = res.read()

        self.docencoding = encutils.getEncodingInfo(
            res, rawdoc, log=self._log).encoding
        self._log.info(u'\nUsing Encoding: %s\n', self.docencoding)

        doctext = rawdoc.decode(self.docencoding)

        # fill list of stylesheets and list of raw css
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist

    def saveto(self, dir, saveraw=False, minified=False):
        """
        saves css in "dir" in the same layout as on the server
        internal stylesheets are saved as "dir/<document>_INLINE_<n>.css",
        <document> being the file name part of the captured URL and <n>
        a counter starting with 1

        dir
            directory to save files to
        saveraw
            save literal CSS from server instead of the parsed CSS
        minified
            save minified CSS

        Both parsed and minified (which is also parsed of course) will
        lose information which cssutils is unable to understand or where
        it is simply buggy. You might want to first save the raw version
        before parsing or even minifying it.
        """
        msg = 'parsed'
        if saveraw:
            msg = 'raw'
        if minified:
            cssutils.ser.prefs.useMinified()
            msg = 'minified'

        inlines = 0
        for i, sheet in enumerate(self.stylesheetlist):
            url = sheet.href
            if not url:
                inlines += 1
                url = u'%s_INLINE_%s.css' % (self._filename, inlines)

            # build savepath
            scheme, loc, path, query, fragment = urlparse.urlsplit(url)
            # no absolute path, however many slashes the URL path starts with
            path = os.path.normpath(path.lstrip('/'))
            # the URL comes from a remote document: drop ".." (after
            # normpath only left at the start) so nothing is ever saved
            # outside of dir
            path = os.sep.join([part for part in path.split(os.sep)
                                if part != os.pardir])
            path, fn = os.path.split(path)
            savepath = os.path.join(dir, path)
            savefn = os.path.join(savepath, fn)
            try:
                os.makedirs(savepath)
            except OSError:
                # an existing directory is fine, anything else is
                # re-raised with its original traceback
                if sys.exc_info()[1].errno != errno.EEXIST:
                    raise
                self._log.debug(u'Path "%s" already exists.', savepath)

            self._log.info(u'SAVING %s, %s %r' % (i+1, msg, savefn))

            sf = open(savefn, 'wb')
            try:
                if saveraw:
                    cssText = self._nonparsed[sheet]
                    uf = codecs.getwriter('css')(sf)
                    uf.write(cssText)
                else:
                    sf.write(sheet.cssText)
            finally:
                # close the file even if encoding or writing fails
                sf.close()
def csscombine(proxypath, sourceencoding=None, targetencoding='utf-8',
               minify=True):
    """Combine sheets referred to by @import rules in given CSS proxy sheet
    into a single new sheet.

    Progress information is written to ``sys.stderr``. Every combined
    sheet is preceded by a comment holding its @import rule. @import rules
    inside an imported sheet are reported and copied as they are, the
    sheets they refer to are not combined.

    :returns: combined cssText, normal or minified
    :Parameters:
        `proxypath`
            path to a CSSStyleSheet which imports other sheets which
            are then combined into one sheet, hrefs of the @import rules
            are resolved relative to the directory of this file
        `sourceencoding`
            encoding of the source sheets including the proxy sheet
        `targetencoding`
            encoding of the combined stylesheet, default 'utf-8'
        `minify`
            defines if the combined sheet should be minified, default True
    """
    sys.stderr.write('COMBINING %s\n' % proxypath)

    if sourceencoding is not None:
        sys.stderr.write('USING SOURCE ENCODING: %s\n' % sourceencoding)

    src = cssutils.parseFile(proxypath, encoding=sourceencoding)
    srcpath = os.path.dirname(proxypath)
    combined = cssutils.css.CSSStyleSheet()
    for rule in src.cssRules:
        if rule.type == rule.IMPORT_RULE:
            fn = os.path.join(srcpath, rule.href)
            sys.stderr.write('* PROCESSING @import %s\n' % fn)
            importsheet = cssutils.parseFile(fn, encoding=sourceencoding)
            importsheet.encoding = None # remove @charset
            combined.add(cssutils.css.CSSComment(cssText=u'/* %s */' %
                                                 rule.cssText))
            for x in importsheet.cssRules:
                if x.type == x.IMPORT_RULE:
                    sys.stderr.write('INFO\tNested @imports are not combined: %s\n' % x.cssText)

                combined.add(x)

        else:
            combined.add(rule)

    sys.stderr.write('SETTING TARGET ENCODING: %s\n' % targetencoding)
    combined.encoding = targetencoding

    if minify:
        # save old setting and use own serializer
        oldser = cssutils.ser
        cssutils.setSerializer(cssutils.serialize.CSSSerializer())
        try:
            cssutils.ser.prefs.useMinified()
            cssText = combined.cssText
        finally:
            # the serializer is global to cssutils: put the old one back
            # even if serializing the combined sheet fails
            cssutils.setSerializer(oldser)
    else:
        cssText = combined.cssText

    return cssText
367