1
2 """Retrieve all CSS stylesheets including embedded for a given URL.
3 Retrieve as StyleSheetList or save to disk - raw, parsed or minified version.
4
5 TODO:
6 - maybe use DOM 3 load/save?
7 - logger class which handles all cases when no log is given...
8 - saveto: why does urllib2 hang?
9 """
10 __all__ = ['CSSCapture']
11 __docformat__ = 'restructuredtext'
12 __author__ = '$LastChangedBy: cthedot $'
13 __date__ = '$LastChangedDate: 2007-11-05 22:34:16 +0100 (Mo, 05 Nov 2007) $'
14 __version__ = '$LastChangedRevision: 647 $'
15
16 import codecs
17 import errno
18 import HTMLParser
19 import logging
20 import os
21 import sys
22 import urllib2
23 import urlparse
24
25 import cssutils
26 try:
27 import encutils
28 except ImportError:
29 try:
30 import cssutils.encutils as encutils
31 except ImportError:
32 sys.exit("You need encutils from http://cthedot.de/encutils/")
33
35 """ parses given data for link and style elements """
36 curtag = u''
37 links = []
38
39 styles = []
40
41
43 return dict([(a.lower(), v.lower()) for a, v in attrs])
44
46 if tag == u'link':
47 attrs = self._lowerattrs(attrs)
48 if attrs.get(u'type', u'') == u'text/css':
49 self.links.append(attrs)
50
51 elif tag == u'style':
52 attrs = self._lowerattrs(attrs)
53 if attrs.get(u'type', u'') == u'text/css':
54 self.styles.append((attrs, u''))
55 self.curtag = tag
56 else:
57
58 self.curtag = u''
59
63
67
71
73 """
74 Retrieve all CSS stylesheets including embedded for a given URL.
75 Optional setting of User-Agent used for retrieval possible
76 to handle browser sniffing servers.
77
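HTTP errors are logged, not raised: a stylesheet that cannot be retrieved is skipped; if the page itself cannot be retrieved ``capture`` calls ``sys.exit(1)``.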
79 """
def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
    """
    Initialize a new capture object.

    ua
        User-Agent string to send with requests; if not given no
        User-Agent header is added by this class
    log
        log object which is used instead of the default log, which
        writes to sys.stderr
    defaultloglevel
        constant of the logging package which defines the level of the
        default log; not used if an explicit ``log`` is given
    """
    self._ua = ua
    self._parser = CSSCaptureHTMLParser()

    if log:
        self._log = log
    else:
        self._log = logging.getLogger('CSSCapture')
        # getLogger() hands out the same logger object for every
        # instance, so attach the stderr handler only once; adding one
        # per instance would print every message several times.
        if not self._log.handlers:
            hdlr = logging.StreamHandler(sys.stderr)
            hdlr.setFormatter(logging.Formatter('%(message)s'))
            self._log.addHandler(hdlr)
        self._log.setLevel(defaultloglevel)
        self._log.debug(u'Using default log')
106
def _doRequest(self, url):
    """
    Do an HTTP request for ``url``.

    Returns: (response, url)

    The returned url is the one reported by the response; it might have
    been changed by the server due to redirects etc.

    An ``urllib2.HTTPError`` is not raised to the caller: it is logged
    as critical and ``(None, None)`` is returned instead. Other
    exceptions of ``urllib2.urlopen`` are not handled here.
    """
    self._log.debug(u' CSSCapture._doRequest\n * URL: %s', url)

    req = urllib2.Request(url)
    if self._ua:
        req.add_header('User-agent', self._ua)
        self._log.info(' * Using User-Agent: %s', self._ua)
    try:
        res = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        self._log.critical(' %s\n%s %s\n%s',
                           e.geturl(), e.code, e.msg, e.headers)
        return None, None

    # report the real URL if the server sent us somewhere else
    realurl = res.geturl()
    if url != realurl:
        url = realurl
        self._log.info(' URL retrieved: %s', url)

    return res, url
134
def _createStyleSheet(self, href=None,
                      media=None,
                      parentStyleSheet=None,
                      title=u'',
                      cssText=None,
                      encoding=None):
    """
    Return a CSSStyleSheet read from ``href`` or, if ``cssText`` is
    given, parsed from that text.

    href
        URL of the stylesheet; requested only if no ``cssText`` is given
    media, parentStyleSheet, title
        set on the resulting sheet as given
    cssText
        CSS source of an inline style
    encoding
        used if inline style found, same as self.docencoding; if not
        given the encoding is taken from the HTTP response

    Returns None if the stylesheet could not be retrieved or decoded.
    The unparsed text is remembered in ``self._nonparsed`` under the
    returned sheet.
    """
    if not cssText:
        if not href:
            # e.g. an empty inline <style> element: there is nothing to
            # request, treat it as an empty stylesheet
            cssText = u''
        else:
            # keep the requested href, _doRequest returns None on failure
            res, realhref = self._doRequest(href)
            if not res:
                self._log.error(u' ERROR accessing CSS\n\t%s' % href)
                return None
            href = realhref

            if not encoding:
                media_type, encoding = encutils.getHTTPInfo(res)
                if media_type != u'text/css':
                    self._log.warn(u' WARNING: HTTP media type is different than expected "text/css": %r' %
                                   media_type)
            try:
                cssText = codecs.getreader('css')(res,
                                                  encoding=encoding).read()
            except UnicodeDecodeError as e:
                self._log.error(u' Error retrieving CSS, probably encoding mismatch:\n\t%s\n\t%s'
                                % (href, e))
                return None

    sheet = cssutils.parseString(cssText)
    sheet.href = href
    sheet.media = media
    sheet.parentStyleSheet = parentStyleSheet
    sheet.title = title
    self._log.debug(u' * title: %s', title)
    if href:
        self._log.info(u' * href : %s', href)
    # media defaults to None, do not fail just for the log message
    if media is not None:
        self._log.info(u' * media: %s', media.mediaText)
    else:
        self._log.info(u' * media: %s', None)
    self._log.info(u' %s\n' % sheet)
    self._log.debug(u' * cssText:\n%s\n', cssText)

    # raw text is needed later by saveto(saveraw=True)
    self._nonparsed[sheet] = cssText
    return sheet
180
181 - def _doImports(self, parentStyleSheet, baseurl=None):
182 """
183 handle all @import CSS stylesheet recusively
184 found CSS stylesheets are appended to stylesheetlist
185 """
186 for rule in parentStyleSheet.cssRules:
187 if rule.type == rule.IMPORT_RULE:
188 self._log.info(u'\n@import FOUND -----')
189 self._log.debug(u' IN: %s\n' % parentStyleSheet)
190 href = urlparse.urljoin(baseurl, rule.href)
191 sheet = self._createStyleSheet(
192 href=href,
193 media=rule.media,
194 parentStyleSheet=parentStyleSheet)
195 if sheet:
196 self.stylesheetlist.append(sheet)
197 self._doImports(sheet, baseurl=href)
198
200 """
201 parse text for stylesheets
202 fills stylesheetlist with all found StyleSheets
203
204 docurl
205 to build a full url of found StyleSheets @href
206 doctext
207 to parse
208 """
209 self._parser.feed(doctext)
210
211 for link in self._parser.links:
212 self._log.info(u'\n<link> FOUND -----')
213 self._log.debug(u' %s\n' % link)
214 href = urlparse.urljoin(docurl, link.get(u'href', u''))
215 sheet = self._createStyleSheet(
216 href=href,
217 media=cssutils.stylesheets.MediaList(
218 link.get(u'media', u'')),
219 title=link.get(u'title', u''))
220 if sheet:
221 self.stylesheetlist.append(sheet)
222 self._doImports(sheet, baseurl=href)
223
224
225
226
227 for style in self._parser.styles:
228 stylemeta, cssText = style
229 self._log.info(u'\n<style> FOUND -----' )
230 self._log.debug(u' %s\n' % stylemeta)
231 sheet = self._createStyleSheet(
232 media=cssutils.stylesheets.MediaList(
233 stylemeta.get(u'media', u'')),
234 title=stylemeta.get(u'title', u''),
235 cssText=cssText,
236 encoding=self.docencoding)
237 if sheet:
238 self.stylesheetlist.append(sheet)
239 self._doImports(sheet, baseurl=docurl)
240
def capture(self, url, ua=None):
    """
    Capture stylesheets for the given url.

    url
        to capture CSS from
    ua
        User-Agent to use for requests; if given it replaces the one
        set at initialization for this and all following requests

    Returns StyleSheetList.

    NOTE: if the request for ``url`` itself fails (``_doRequest``
    returns no response) this method does not raise but calls
    ``sys.exit(1)``.
    """
    if ua is not None:
        self._ua = ua

    self._log.info(u'\nCapturing CSS from URL: %s\n', url)
    self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

    # filename of the document, used by saveto to name inline styles
    self._filename = os.path.basename(urlparse.urlsplit(url)[2])

    # url might be changed by the server (redirects)
    res, url = self._doRequest(url)
    if not res:
        sys.exit(1)
    rawdoc = res.read()

    self.docencoding = encutils.getEncodingInfo(
        res, rawdoc, log=self._log).encoding
    self._log.info(u'\nUsing Encoding: %s\n', self.docencoding)

    doctext = unicode(rawdoc, self.docencoding)

    # raw CSS text of every created sheet, filled by _createStyleSheet
    self._nonparsed = {}
    self._findStyleSheets(url, doctext)

    return self.stylesheetlist
280
281 - def saveto(self, dir, saveraw=False, minified=False):
282 """
283 saves css in "dir" in the same layout as on the server
284 internal stylesheets are saved as "dir/__INLINE_STYLE__.html.css"
285
286 dir
287 directory to save files to
288 saveparsed
289 save literal CSS from server or save the parsed CSS
290 minified
291 save minified CSS
292
293 Both parsed and minified (which is also parsed of course) will
294 loose information which cssutils is unable to understand or where
295 it is simple buggy. You might to first save the raw version before
296 parsing of even minifying it.
297 """
298 msg = 'parsed'
299 if saveraw:
300 msg = 'raw'
301 if minified:
302 cssutils.ser.prefs.useMinified()
303 msg = 'minified'
304
305 inlines = 0
306 for sheet in self.stylesheetlist:
307 url = sheet.href
308 if not url:
309 inlines += 1
310 url = '%s_INLINE_%s.css' % (
311 self._filename, inlines)
312
313
314 scheme, loc, path, query, fragment = urlparse.urlsplit(url)
315
316 if path and path.startswith('/'):
317 path = path[1:]
318 path = os.path.normpath(path)
319 path, fn = os.path.split(path)
320 savepath = os.path.join(dir, loc, path)
321 savefn = os.path.join(savepath, fn)
322 try:
323 os.makedirs(savepath)
324 except OSError, e:
325 if e.errno != errno.EEXIST:
326 raise e
327 self._log.debug(u'Path "%s" already exists.', savepath)
328
329 if saveraw:
330 cssText = self._nonparsed[sheet]
331 else:
332 cssText = sheet.cssText
333
334 self._log.info(u'Saving %s "%s"' % (msg, savefn))
335 sf = open(savefn, 'wb')
336 uf = codecs.getwriter('css')(sf)
337 uf.write(cssText)
338 sf.close()
339
340 -def main(args=None):
341 import optparse
342
343 usage = "usage: %prog [options] URL"
344 parser = optparse.OptionParser(usage=usage)
345 parser.add_option('-d', '--debug', action='store_true', dest='debug',
346 help='show debug messages during capturing')
347 parser.add_option('-m', '--minified', action='store_true', dest='minified',
348 help='saves minified version of captured files')
349 parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
350 help='if given files are NOT saved, only log is written')
351 parser.add_option('-r', '--saveraw', action='store_true', dest='saveraw',
352 help='if given saves raw css otherwise cssutils\' parsed files')
353 parser.add_option('-s', '--saveto', action='store', dest='saveto',
354 help='saving retrieved files to "saveto", defaults to "_CSSCapture_SAVED"')
355 parser.add_option('-u', '--useragent', action='store', dest='ua',
356 help='useragent to use for request of URL, default is urllib2s default')
357 options, url = parser.parse_args()
358
359 if not url:
360 parser.error('no URL given')
361 else:
362 url = url[0]
363
364 if options.debug:
365 dll = logging.DEBUG
366 else:
367 dll = logging.INFO
368
369
370 c = CSSCapture(defaultloglevel=dll)
371
372 stylesheetlist = c.capture(url, ua=options.ua)
373
374 if options.notsave is None or not options.notsave:
375 if options.saveto:
376 saveto = options.saveto
377 else:
378 saveto = '_CSSCapture_SAVED'
379 c.saveto(saveto, saveraw=options.saveraw, minified=options.minified)
380 else:
381 for i, s in enumerate(stylesheetlist):
382 print i+1, u'\ttitle: "%s", \n\thref : "%s"\n' % (s.title, s.href)
383
384
if __name__ == '__main__':
    # run the command line interface, main's result is the exit status
    status = main()
    sys.exit(status)
387