Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2import re 

3import subprocess 

4import sys 

5from .source import Source 

6from .configuration import Configuration 

7import io 

8import codecs 

try:
    # Python 2.x and 3.x support for checking string types.
    # A bare name lookup is used instead of ``assert basestring``: assert
    # statements are removed under ``python -O``, which would leave the
    # name undefined on Python 3 because NameError would never be raised.
    basestring
except NameError:
    # Python 3: there is no ``basestring``; ``str`` is the text type.
    basestring = str

14 

15 

class PDFKit(object):
    """
    Main class that does all generation routine.

    :param url_or_file: str - either a URL, a path to a file or a string containing HTML
                       to convert
    :param type_: str - either 'url', 'file' or 'string'
    :param options: dict (optional) with wkhtmltopdf options, with or w/o '--'
    :param toc: dict (optional) - toc-specific wkhtmltopdf options, with or w/o '--'
    :param cover: str (optional) - url/filename with a cover html page
    :param css: str or list (optional) - path(s) of CSS file(s) whose content
                is injected into the source as a <style> tag
    :param configuration: (optional) instance of pdfkit.configuration.Configuration()
    :param cover_first: bool (optional) - when True the cover is emitted
                        before the toc arguments, otherwise after them
    """

    class ImproperSourceError(Exception):
        """Wrong source type for stylesheets"""

        def __init__(self, msg):
            # Forward the message so ``args``/``repr`` carry it as well.
            Exception.__init__(self, msg)
            self.msg = msg

        def __str__(self):
            return self.msg

    def __init__(self, url_or_file, type_, options=None, toc=None, cover=None,
                 css=None, configuration=None, cover_first=False):

        self.source = Source(url_or_file, type_)
        self.configuration = (Configuration() if configuration is None
                              else configuration)
        try:
            # The configured binary path may be bytes; decode it to text.
            self.wkhtmltopdf = self.configuration.wkhtmltopdf.decode('utf-8')
        except AttributeError:
            # No ``decode`` attribute: use the configured value as-is.
            self.wkhtmltopdf = self.configuration.wkhtmltopdf

        # Options found in the HTML meta tags come first so that explicitly
        # passed options override them.
        self.options = dict()
        if self.source.isString():
            self.options.update(self._find_options_in_meta(url_or_file))

        if options is not None:
            self.options.update(options)

        self.toc = {} if toc is None else toc
        self.cover = cover
        self.cover_first = cover_first
        self.css = css
        self.stylesheets = []

    def _genargs(self, opts):
        """
        Generator of args parts based on options specification.

        Note: Empty parts will be filtered out at _command generator
        """
        for optkey, optval in self._normalize_options(opts):
            yield optkey

            if isinstance(optval, (list, tuple)):
                # Kept as an assert so the raised exception type is unchanged.
                assert len(optval) == 2 and optval[0] and optval[1], 'Option value can only be either a string or a (tuple, list) of 2 items'
                yield optval[0]
                yield optval[1]
            else:
                yield optval

    def _command(self, path=None):
        """
        Generator of all command parts

        NOTE: when ``self.css`` is set, the stylesheet is injected into the
        source each time this generator is consumed.

        :param path: output file path; when falsy '-' is emitted instead
        """
        if self.css:
            self._prepend_css(self.css)

        yield self.wkhtmltopdf

        for argpart in self._genargs(self.options):
            if argpart:
                yield argpart

        if self.cover and self.cover_first:
            yield 'cover'
            yield self.cover

        if self.toc:
            yield 'toc'
            for argpart in self._genargs(self.toc):
                if argpart:
                    yield argpart

        if self.cover and not self.cover_first:
            yield 'cover'
            yield self.cover

        # If the source is a string then we will pipe it into wkhtmltopdf
        # If the source is file-like then we will read from it and pipe it in
        if self.source.isString() or self.source.isFileObj():
            yield '-'
        else:
            if isinstance(self.source.source, basestring):
                yield self.source.to_s()
            else:
                for s in self.source.source:
                    yield s

        # If output_path evaluates to False append '-' to end of args
        # and wkhtmltopdf will pass generated PDF to stdout
        if path:
            yield path
        else:
            yield '-'

    def command(self, path=None):
        """Return the full wkhtmltopdf command line as a list of parts."""
        return list(self._command(path))

    def to_pdf(self, path=None):
        """Run wkhtmltopdf and produce the PDF.

        :param path: output file path; when falsy the PDF is returned

        returns:
            the captured stdout of wkhtmltopdf when ``path`` is falsy,
            otherwise True once the output file is found to be non-empty

        raises:
            IOError: when wkhtmltopdf reports an error, exits with a
                     non-zero code, or the output file is missing or empty
        """
        args = self.command(path)

        result = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)

        # If the source is a string then we will pipe it into wkhtmltopdf.
        # If we want to add custom CSS to file then we read input file to
        # string and prepend css to it and then pass it to stdin.
        # This is a workaround for a bug in wkhtmltopdf (look closely in README)
        if self.source.isString() or (self.source.isFile() and self.css):
            stdin_data = self.source.to_s().encode('utf-8')
        elif self.source.isFileObj():
            stdin_data = self.source.source.read()
            # A file object opened in binary mode already yields bytes;
            # only text needs encoding before it is piped in.
            if not isinstance(stdin_data, bytes):
                stdin_data = stdin_data.encode('utf-8')
        else:
            stdin_data = None
        stdout, stderr = result.communicate(input=stdin_data)
        stderr = stderr or stdout
        try:
            stderr = stderr.decode('utf-8')
        except UnicodeDecodeError:
            # Undecodable output (e.g. PDF bytes taken from stdout) is
            # treated as "no diagnostic text".
            stderr = ''
        exit_code = result.returncode

        if 'cannot connect to X server' in stderr:
            raise IOError('%s\n'
                          'You will need to run wkhtmltopdf within a "virtual" X server.\n'
                          'Go to the link below for more information\n'
                          'https://github.com/JazzCore/python-pdfkit/wiki/Using-wkhtmltopdf-without-X-server' % stderr)

        if 'Error' in stderr:
            raise IOError('wkhtmltopdf reported an error:\n' + stderr)

        if exit_code != 0:
            raise IOError("wkhtmltopdf exited with non-zero code {0}. error:\n{1}".format(exit_code, stderr))

        # Since wkhtmltopdf sends its output to stderr we will capture it
        # and properly send to stdout
        if '--quiet' not in args:
            sys.stdout.write(stderr)

        if not path:
            return stdout

        self._verify_output_file(path, args)
        return True

    def _verify_output_file(self, path, args):
        """Check that the generated file at ``path`` exists and is not empty.

        :param path: path of the file wkhtmltopdf was asked to write
        :param args: the command parts, used only for the error message

        raises:
            IOError: when the file cannot be opened or contains no data
        """
        failure = ('Command failed: %s\n'
                   'Check wkhtmltopdf output without \'quiet\' option'
                   % ' '.join(args))
        try:
            # Binary mode: the PDF is not text, only its first bytes
            # (the '%PDF' signature) are needed to see that it is non-empty.
            with open(path, 'rb') as f:
                signature = f.read(4)
        except IOError as e:
            raise IOError('%s\n%s' % (failure, e))

        # Raised outside the try block so it is not caught and re-wrapped.
        if not signature:
            raise IOError(failure)

    def _normalize_options(self, options):
        """ Generator of 2-tuples (option-key, option-value).
        When options spec is a list, generate a 2-tuples per list item.

        :param options: dict {option name: value}

        returns:
          iterator (option-key, option-value)
          - option names lower cased and prepended with
            '--' if necessary. Non-empty values cast to str
        """

        for key, value in list(options.items()):
            if '--' not in key:
                normalized_key = '--%s' % self._normalize_arg(key)
            else:
                normalized_key = self._normalize_arg(key)

            if isinstance(value, (list, tuple)):
                # One (key, item) pair per item; items are passed through
                # without being cast to str.
                for optval in value:
                    yield (normalized_key, optval)
            else:
                yield (normalized_key, str(value) if value else value)

    def _normalize_arg(self, arg):
        """Return ``arg`` lower cased."""
        return arg.lower()

    def _style_tag_for(self, stylesheet):
        """Wrap ``stylesheet`` (CSS text) in a <style> tag."""
        return "<style>%s</style>" % stylesheet

    def _insert_style(self, html, css_data):
        """Return ``html`` with ``css_data`` injected as a <style> tag.

        The tag is placed before every '</head>' occurrence; when the
        markup has no '</head>' it is prepended to the document instead.
        """
        style_tag = self._style_tag_for(css_data)
        if '</head>' in html:
            return html.replace('</head>', style_tag + '</head>')
        return style_tag + html

    def _prepend_css(self, path):
        """Inject the content of CSS file(s) into the source.

        A file source is read and replaced by a string source holding the
        modified markup; a string source is modified in place.

        :param path: str or list - path(s) of the CSS file(s) to inject

        raises:
            ImproperSourceError: when the source is a URL or a list
        """
        if self.source.isUrl() or isinstance(self.source.source, list):
            raise self.ImproperSourceError('CSS files can be added only to a single '
                                           'file or string')

        if not isinstance(path, list):
            path = [path]

        css_data = []
        for p in path:
            with codecs.open(p, encoding="UTF-8") as f:
                css_data.append(f.read())
        css_data = "\n".join(css_data)

        if self.source.isFile():
            with codecs.open(self.source.to_s(), encoding="UTF-8") as f:
                inp = f.read()
            self.source = Source(self._insert_style(inp, css_data), 'string')

        elif self.source.isString():
            self.source.source = self._insert_style(self.source.to_s(), css_data)

    def _find_options_in_meta(self, content):
        """Reads 'content' and extracts options encoded in HTML meta tags

        :param content: str or file-like object - contains HTML to parse

        returns:
          dict: {config option: value}
        """
        if (isinstance(content, io.IOBase)
                or content.__class__.__name__ == 'StreamReaderWriter'):
            content = content.read()

        # The prefix is literal text, so escape it before building patterns.
        prefix = re.escape(self.configuration.meta_tag_prefix)
        name_re = re.compile('name=["\']%s([^"\']*)' % prefix)
        content_re = re.compile('content=["\']([^"\']*)')

        found = {}

        for tag in re.findall('<meta [^>]*>', content):
            name = name_re.search(tag)
            if name is None:
                continue
            value = content_re.search(tag)
            if value is None:
                # A prefixed meta tag without a content attribute carries
                # no option value; skip it.
                continue
            found[name.group(1)] = value.group(1)

        return found

257 if re.search('name=["\']%s' % self.configuration.meta_tag_prefix, x): 

258 name = re.findall('name=["\']%s([^"\']*)' % 

259 self.configuration.meta_tag_prefix, x)[0] 

260 found[name] = re.findall('content=["\']([^"\']*)', x)[0] 

261 

262 return found