Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

1001

1002

1003

1004

1005

1006

1007

1008

1009

1010

1011

1012

1013

1014

1015

1016

1017

1018

1019

1020

1021

1022

1023

1024

1025

1026

1027

1028

1029

1030

1031

1032

1033

1034

1035

1036

1037

1038

1039

1040

1041

1042

1043

1044

1045

1046

1047

1048

1049

1050

1051

1052

1053

1054

1055

1056

1057

1058

1059

1060

1061

1062

1063

1064

1065

1066

1067

1068

1069

1070

1071

1072

1073

1074

1075

1076

1077

1078

1079

1080

1081

1082

1083

1084

1085

1086

1087

1088

1089

1090

1091

1092

1093

1094

1095

1096

1097

1098

1099

1100

1101

1102

1103

1104

1105

1106

1107

1108

1109

1110

1111

1112

1113

1114

1115

1116

1117

1118

1119

1120

1121

1122

1123

1124

1125

1126

1127

1128

1129

1130

1131

1132

1133

1134

1135

1136

1137

1138

1139

1140

1141

1142

1143

1144

1145

1146

1147

1148

1149

1150

1151

1152

1153

1154

1155

1156

1157

1158

1159

1160

1161

1162

1163

1164

1165

1166

1167

1168

1169

1170

1171

1172

1173

1174

1175

1176

1177

1178

1179

1180

1181

1182

1183

1184

1185

1186

1187

1188

1189

1190

1191

1192

1193

1194

1195

1196

1197

1198

1199

1200

1201

1202

1203

1204

1205

1206

1207

1208

1209

1210

1211

1212

1213

1214

1215

1216

1217

1218

1219

1220

1221

1222

1223

1224

1225

1226

1227

1228

1229

1230

1231

1232

1233

1234

1235

1236

1237

1238

1239

1240

1241

1242

1243

1244

1245

1246

1247

1248

1249

1250

1251

1252

1253

1254

1255

1256

1257

1258

1259

1260

1261

1262

1263

1264

1265

1266

1267

1268

1269

1270

1271

1272

1273

1274

1275

1276

1277

1278

1279

1280

1281

1282

1283

1284

1285

1286

1287

1288

1289

1290

1291

1292

1293

1294

1295

1296

1297

1298

1299

1300

1301

1302

1303

1304

1305

1306

1307

1308

1309

1310

1311

1312

1313

1314

1315

1316

1317

1318

1319

1320

1321

1322

1323

1324

1325

1326

1327

1328

1329

1330

1331

1332

1333

1334

1335

1336

1337

1338

1339

1340

1341

1342

1343

1344

1345

1346

1347

1348

1349

1350

1351

1352

1353

1354

1355

1356

1357

1358

1359

1360

1361

1362

1363

1364

1365

1366

1367

1368

1369

1370

1371

1372

1373

1374

1375

1376

1377

1378

1379

1380

1381

1382

1383

1384

1385

1386

1387

1388

1389

1390

1391

1392

1393

1394

1395

1396

1397

1398

1399

1400

1401

1402

1403

1404

1405

1406

1407

1408

1409

1410

1411

1412

1413

1414

1415

1416

1417

1418

1419

1420

1421

1422

1423

1424

1425

1426

1427

1428

1429

1430

1431

1432

1433

1434

1435

1436

1437

1438

1439

1440

1441

1442

1443

1444

1445

1446

1447

1448

1449

1450

1451

1452

1453

1454

1455

1456

1457

1458

1459

1460

1461

1462

1463

1464

1465

1466

1467

1468

1469

1470

"""An extensible library for opening URLs using a variety of protocols 

 

The simplest way to use this module is to call the urlopen function, 

which accepts a string containing a URL or a Request object (described 

below).  It opens the URL and returns the results as a file-like 
object; the returned object has some extra methods described below. 

 

The OpenerDirector manages a collection of Handler objects that do 

all the actual work.  Each Handler implements a particular protocol or 

option.  The OpenerDirector is a composite object that invokes the 

Handlers needed to open the requested URL.  For example, the 

HTTPHandler performs HTTP GET and POST requests and deals with 

non-error returns.  The HTTPRedirectHandler automatically deals with 

HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 

deals with digest authentication. 

 

urlopen(url, data=None) -- Basic usage is the same as original 

urllib.  pass the url and optionally data to post to an HTTP URL, and 

get a file-like object back.  One difference is that you can also pass 

a Request instance instead of URL.  Raises a URLError (subclass of 

IOError); for HTTP errors, raises an HTTPError, which can also be 

treated as a valid response. 

 

build_opener -- Function that creates a new OpenerDirector instance. 

Will install the default handlers.  Accepts one or more Handlers as 

arguments, either instances or Handler classes that it will 

instantiate.  If one of the arguments is a subclass of the default 

handler, the argument will be installed instead of the default. 

 

install_opener -- Installs a new opener as the default opener. 

 

objects of interest: 

 

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 

the Handler classes, while dealing with requests and responses. 

 

Request -- An object that encapsulates the state of a request.  The 

state can be as simple as the URL.  It can also include extra HTTP 

headers, e.g. a User-Agent. 

 

BaseHandler -- 

 

exceptions: 

URLError -- A subclass of IOError, individual protocols have their own 

specific subclass. 

 

HTTPError -- Also a valid HTTP response, so you can treat an HTTP error 

as an exceptional event or valid response. 

 

internals: 

BaseHandler and parent 

_call_chain conventions 

 

Example usage: 

 

import urllib2 

 

# set up authentication info 

authinfo = urllib2.HTTPBasicAuthHandler() 

authinfo.add_password(realm='PDQ Application', 

                      uri='https://mahler:8092/site-updates.py', 

                      user='klem', 

                      passwd='geheim$parole') 

 

proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) 

 

# build a new opener that adds authentication and caching FTP handlers 

opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) 

 

# install it 

urllib2.install_opener(opener) 

 

f = urllib2.urlopen('http://www.python.org/') 

 

 

""" 

 

# XXX issues: 

# If an authentication error handler that tries to perform 

# authentication for some reason but fails, how should the error be 

# signalled?  The client needs to know the HTTP error code.  But if 

# the handler knows what the problem was, e.g., that it didn't know 
# the hash algorithm requested in the challenge, it would be good to 
# pass that information along to the client, too. 

# ftp errors aren't handled cleanly 

# check digest against correct (i.e. non-apache) implementation 

 

# Possible extensions: 

# complex proxies  XXX not sure what exactly was meant by this 

# abstract factory for opener 

 

import base64 

import hashlib 

import httplib 

import mimetools 

import os 

import posixpath 

import random 

import re 

import socket 

import sys 

import time 

import urlparse 

import bisect 

import warnings 

 

try: 

    from cStringIO import StringIO 

except ImportError: 

    from StringIO import StringIO 

 

from urllib import (unwrap, unquote, splittype, splithost, quote, 

     addinfourl, splitport, splittag, toBytes, 

     splitattr, ftpwrapper, splituser, splitpasswd, splitvalue) 

 

# support for FileHandler, proxies via environment variables 

from urllib import localhost, url2pathname, getproxies, proxy_bypass 

 

# used in User-Agent header sent
# NOTE: slicing sys.version keeps only the "major.minor" prefix
# (e.g. "2.7"); OpenerDirector builds "Python-urllib/<version>" from it.
__version__ = sys.version[:3]

 

# Module-wide default opener; created lazily, replaceable via install_opener().
_opener = None

def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) with the default opener.

    The shared opener is built on first use via build_opener() and
    cached in the module global ``_opener``.  Returns whatever the
    opener's open() returns (a file-like response object).
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data, timeout)

 

def install_opener(opener):
    """Make *opener* the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener

 

# do these error classes make sense? 

# make sure all of the IOError stuff is overridden.  we just want to be 

# subtypes. 

 

class URLError(IOError):
    """Error raised while opening a URL.

    Subtypes IOError purely for exception-hierarchy compatibility; it
    shares none of the implementation.  ``args`` holds just
    ``(reason,)`` rather than the usual (errno, strerror) pair, which
    keeps EnvironmentError-style unpickling working even though the
    slots carry different meanings.
    """

    def __init__(self, reason):
        self.args = (reason,)
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason

 

class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""

    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # addinfourl.__init__ needs a usable file object.  Some
        # HTTPErrors carry no body (fp is None); in that case skip
        # base-class initialization so the instance can still be built.
        if fp is not None:
            self.__super_init(fp, hdrs, url, code)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def info(self):
        # Response-interface method: expose the headers.
        return self.hdrs

    # URLError defines a .reason attribute, so HTTPError provides one
    # too (see issue13211); here it is simply the HTTP status message.
    @property
    def reason(self):
        return self.msg

 

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    full_url = request.get_full_url()
    # netloc component of the URL; fall back to the Host header when
    # the URL itself carries no host part.
    host = urlparse.urlparse(full_url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # strip at most one trailing :port, then normalize case
    return _cut_port_re.sub("", host, 1).lower()

 

class Request:
    """Encapsulates the state of a single URL request.

    At minimum this is the URL itself; it may also carry POST data,
    extra HTTP headers, and redirect/cookie bookkeeping
    (origin_req_host, unverifiable).  Several attributes — type, host,
    and the name-mangled __r_* remainders — are computed lazily by the
    get_* accessors below and backed up by __getattr__.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        # Keep the fragment separate; get_full_url() re-attaches it.
        self.__original, self.__fragment = splittag(self.__original)
        self.type = None       # URL scheme, filled in lazily by get_type()
        # self.__r_type is what's left after doing the splittype
        self.host = None       # filled in lazily by get_host()
        self.port = None       # not computed by any accessor in this class
        self._tunnel_host = None   # set by set_proxy() for https-over-proxy
        self.data = data
        self.headers = {}
        # Route every header through add_header() so keys are
        # normalized (capitalized) consistently.
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # '_Request__r_<name>' is the mangled spelling of self.__r_<name>.
        # If a matching get_<name>() accessor exists, invoke it so the
        # attribute gets computed as a side effect, then return it.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def get_method(self):
        """Return the HTTP method: POST when data is present, else GET."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        # Replaces (does not append to) any existing payload.
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        """Return the original URL, with its fragment re-attached if any."""
        if self.__fragment:
            return '%s#%s' % (self.__original, self.__fragment)
        else:
            return self.__original

    def get_type(self):
        """Return the URL scheme, splitting it off lazily on first call."""
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        """Return the (unquoted) host, splitting it off lazily on first call."""
        if self.host is None:
            # Relies on __r_type; __getattr__ triggers get_type() if needed.
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        # The part of the URL sent on the request line; for proxied
        # requests set_proxy() makes this the full original URL.
        return self.__r_host

    def set_proxy(self, host, type):
        """Route this request through a proxy at *host* of scheme *type*."""
        if self.type == 'https' and not self._tunnel_host:
            # For https, remember the real host so a CONNECT tunnel can
            # be set up; scheme and selector stay untouched.
            self._tunnel_host = self.host
        else:
            self.type = type
            # Full URL as selector is what has_proxy() keys off.
            self.__r_host = self.__original

        self.host = host

    def has_proxy(self):
        # True once set_proxy() replaced the selector with the full URL.
        return self.__r_host == self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        # Normal headers take precedence over unredirected ones.
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()

 

class OpenerDirector:
    """Manages a chain of handler objects that cooperate to open URLs.

    Handlers register through add_handler(); the names of their
    methods ('<proto>_open', '<proto>_request', '<proto>_response',
    '<proto>_error_<kind>') determine which dispatch table they land
    in and for which protocol they are consulted.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}        # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}       # scheme -> {code/kind -> [handlers]}
        self.process_response = {}   # scheme -> [response post-processors]
        self.process_request = {}    # scheme -> [request pre-processors]

    def add_handler(self, handler):
        """Register *handler*, dispatching on its method names.

        Each method named '<protocol>_<condition>' routes the handler
        into the matching lookup table; unrelated methods are skipped.
        Raises TypeError for objects without an add_parent() method.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split '<protocol>_<condition>' on the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # 'http_error_404' -> kind 404.  Numeric kinds become
                # ints; anything else stays a string (e.g. 'default').
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each per-kind list ordered (BaseHandler.__lt__
            # compares handler_order).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Try each handler in chain[kind] until one returns non-None."""
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open fullurl (URL string or Request), returning the response.

        Runs, in order: the protocol's request pre-processors, the
        open handlers (via _open), then the response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Find a handler willing to open req.

        Order: 'default_open' handlers, then '<protocol>_open'
        handlers, then 'unknown_open' handlers.
        """
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For http/https the dispatch key becomes the status code
        (args[2]); if no specific handler returns a result, the
        'http_error_default' handlers get the original arguments.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

 

# XXX probably also want an abstract factory that knows when it makes 

# sense to skip a superclass in favor of a subclass and when it might 

# make sense to include both 

 

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable, HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    import types

    def isclass(obj):
        # Accept both classic (old-style) and new-style classes.
        return isinstance(obj, (types.ClassType, type))

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        # SSL support is compiled in, so HTTPS can be handled too.
        default_classes.append(HTTPSHandler)

    # Drop every default whose role is taken over by a caller-supplied
    # handler (either an instance of it, or a subclass passed as a class).
    skip = set()
    for check in handlers:
        for klass in default_classes:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    default_classes = [klass for klass in default_classes
                       if klass not in skip]

    for klass in default_classes:
        opener.add_handler(klass())

    # Caller handlers go in last; classes are instantiated on the fly.
    for h in handlers:
        opener.add_handler(h() if isclass(h) else h)
    return opener

 

class BaseHandler:
    """Common base for handlers managed by an OpenerDirector.

    Handlers sort by ``handler_order`` (see __lt__), which controls
    the position in each dispatch chain; 500 is the default slot.
    """

    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        """No-op; retained only for backwards compatibility."""
        pass

    def __lt__(self, other):
        # Objects without a handler_order (legacy user classes that
        # predate the attribute) deliberately sort after us, matching
        # the old insert-custom-handlers-last behavior.
        if not hasattr(other, "handler_order"):
            return True
        return self.handler_order < other.handler_order

 

 

class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses through the opener's error chain."""

    handler_order = 1000  # after all other response processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, a "2xx" code indicates the client's
        # request was successfully received, understood, and accepted;
        # anything else is handed to the registered error handlers,
        # which may raise or may return a substitute response.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response

 

class HTTPDefaultErrorHandler(BaseHandler):
    """Last resort: any HTTP error nobody else handled is raised
    as an HTTPError exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)

 

class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            # Drop entity headers: the new Request carries no data, so
            # it is sent as a GET and must not advertise a body.
            newheaders = dict((k,v) for k,v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type")
                             )
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            # No redirect target supplied; give another handler a shot.
            return

        # fix a possible malformed URL
        urlparts = urlparse.urlparse(newurl)
        if not urlparts.path:
            # An empty path (e.g. 'http://host?q=x') is normalized to '/'.
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlparse.urlunparse(urlparts)

        # Resolve a relative Location against the original request URL.
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # For security reasons we do not allow redirects to protocols
        # other than HTTP, HTTPS or FTP.
        newurl_lower = newurl.lower()
        if not (newurl_lower.startswith('http://') or
                newurl_lower.startswith('https://') or
                newurl_lower.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            # Bail out if one URL repeats too often or the total hop
            # count is exhausted.
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            # First redirect: start the shared visit-count dict.
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # 301, 303 and 307 share the 302 implementation.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

 

 

def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, rest = splittype(proxy)
    if not rest.startswith("/"):
        # No slash after the scheme: treat the whole string as a bare
        # authority and discard whatever splittype thought the scheme was.
        scheme = None
        authority = proxy
    else:
        if not rest.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # Per RFC 3986 ss 3.2 and 3.3, when an authority is present the
        # path is empty or begins with '/', so the authority runs from
        # just past '//' up to the next slash (or the end of the string).
        path_start = rest.find("/", 2)
        if path_start == -1:
            path_start = None
        authority = rest[2:path_start]
    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport

 

class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy_url} mapping.

    For every scheme in the mapping, a ``<scheme>_open`` method is
    attached to the instance so OpenerDirector dispatches requests of
    that scheme to proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        # Default to the platform's proxy settings (environment
        # variables, registry, ...) as found by getproxies().
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # Bind url/type/meth as lambda defaults so each generated
            # <type>_open method captures this iteration's values; a
            # plain closure would see only the final loop values.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite req to go via proxy; return None to let other handlers open it."""
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)

        if proxy_type is None:
            # Bare authority given: assume the proxy speaks the
            # request's own scheme.
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            # Host is on the platform's proxy bypass list.
            return None

        if user and password:
            # Pre-emptively attach Basic credentials for the proxy.
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

 

class HTTPPasswordMgr:
    """Store and look up (user, password) pairs by realm and URI."""

    def __init__(self):
        # realm -> {tuple-of-reduced-URIs: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        realm_map = self.passwd.setdefault(realm, {})
        # Index under both the default-port-qualified form and the
        # verbatim form so lookups match whether or not the port is
        # written out explicitly.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uri)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching realm/authuri, else (None, None)."""
        candidates = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in candidates.iteritems():
                if any(self.is_suburi(uri, reduced_authuri) for uri in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # Full URI: authority comes from the netloc component.
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # Bare host or host:port.
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the scheme's default port explicit in the authority.
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])

 

 

class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; fall back to the default realm (None)."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password

 

 

class AbstractBasicAuthHandler:
    """Shared machinery for Basic auth against servers (401) and proxies (407).

    Subclasses define auth_header and wire http_error_401/407 to
    http_error_auth_reqed().
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password
        # Consecutive failed attempts for the current request cycle.
        self.retried = 0

    def reset_retry_count(self):
        # Called by the concrete handler after each 401/407 cycle ends.
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer a Basic challenge; return a response or None.

        authreq is the challenge header name to look for; host may be
        an authority (without userinfo) or a URL with an authority.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if quote not in ['"', "'"]:
                    # Tolerated, but flagged: RFC 2617 requires a
                    # quoted-string realm value.
                    warnings.warn("Basic Auth Realm was unquoted",
                                  UserWarning, 2)
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        # Success: clear the failure streak.
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue req with an Authorization header, or return None."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.b64encode(raw).strip()
            # If this exact credential was already sent and rejected,
            # give up instead of looping.
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

 

 

class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 challenges from the origin server with Basic auth."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full URL identifies the protection space for the
        # password-manager lookup.
        retry = self.http_error_auth_reqed('www-authenticate',
                                           req.get_full_url(), req, headers)
        self.reset_retry_count()
        return retry

 

 

class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 challenges from a proxy with Basic auth."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires an authority with no userinfo
        # component.  urllib2 does not (and should not, RFC 3986
        # s. 3.2.1) support URLs carrying userinfo, so req.get_host()
        # is safe to pass through.
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.get_host(), req, headers)
        self.reset_retry_count()
        return retry

 

 

def randombytes(n):
    """Return a string of n random bytes.

    Reads /dev/urandom when it can be opened, falling back to the
    random module otherwise.  It might be worthwhile to extend this
    function to use other platform-specific mechanisms for getting
    random bytes.  NOTE: the fallback is NOT cryptographically strong.
    """
    # EAFP: try to open the device instead of racing an existence
    # check (the old os.path.exists()+open() pair was a TOCTOU hole).
    try:
        # Binary mode, so the result is raw bytes and no newline
        # translation can corrupt or shorten the data.
        f = open("/dev/urandom", "rb")
    except (IOError, OSError):
        return "".join(chr(random.randrange(0, 256)) for i in range(n))
    try:
        return f.read(n)
    finally:
        f.close()

 

class AbstractDigestAuthHandler:
    """Shared machinery for Digest authentication on 401/407 responses.

    Subclasses define auth_header and wire http_error_401/407 to
    http_error_auth_reqed().
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # Consecutive challenge count for the current request cycle.
        self.retried = 0
        # nonce_count/last_nonce implement the nc-value of RFC 2617
        # s. 3.2.2: how many requests have used the current nonce.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        # Called by the concrete handler after each 401/407 cycle ends.
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a Digest challenge found in auth_header, or return None."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue req with an Authorization header built from challenge auth."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            # If exactly this credential was already sent and rejected,
            # give up instead of looping.
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization field value for req, or return None."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # Unsupported algorithm: we cannot answer this challenge.
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        # A1/A2 per RFC 2617 ss. 3.2.2.2-3.2.2.3.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            # The nc value restarts at 1 whenever the server issues a
            # fresh nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce

            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # A truthy qop here can only be 'auth' (anything else raised
            # above), so ncvalue/cnonce are guaranteed to be bound.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest callables for algorithm, or H=None.

        Bug fix: an unsupported algorithm previously left H unbound and
        made ``return H, KD`` raise UnboundLocalError; get_authorization
        already treats ``H is None`` as "cannot handle this challenge",
        so return None explicitly instead.
        """
        # algorithm should be case-insensitive according to RFC2617
        algorithm = algorithm.upper()
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        else:
            # XXX MD5-sess
            H = None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None

 

 

class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc (host[:port]) component names the protection space.
        netloc = urlparse.urlparse(req.get_full_url())[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              netloc, req, headers)
        self.reset_retry_count()
        return response

 

 

class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 challenges from a proxy using Digest authentication."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.get_host(), req, headers)
        self.reset_retry_count()
        return response

 

class AbstractHTTPHandler(BaseHandler):
    """Base for the HTTP and HTTPS handlers: request fixup plus wire I/O."""

    def __init__(self, debuglevel=0):
        # Passed through to the connection's set_debuglevel() for
        # wire-level tracing.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in standard headers (Host, Content-*) and return request.

        Raises URLError when the request names no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            # Supply defaults only; never clobber headers the caller
            # set explicitly.
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # For proxied requests the selector is an absolute URL; the
            # Host header must name the origin server, not the proxy.
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)

        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. content-type ->
        # Content-Type) so duplicates cannot sneak through.
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            # HTTPS through a proxy: open a CONNECT tunnel to the real
            # host, sending only the proxy-auth header on the CONNECT.
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
        except socket.error, err: # XXX what error?
            h.close()
            raise URLError(err)
        else:
            try:
                r = h.getresponse(buffering=True)
            except TypeError: # buffering kw not supported
                r = h.getresponse()

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp

 

 

class HTTPHandler(AbstractHTTPHandler):
    # Scheme handler for plain http URLs.

    def http_open(self, req):
        # One httplib.HTTPConnection per request; AbstractHTTPHandler
        # does the rest.
        return self.do_open(httplib.HTTPConnection, req)

    # Request preprocessing (Host/Content-* headers) is shared with HTTPS.
    http_request = AbstractHTTPHandler.do_request_

 

if hasattr(httplib, 'HTTPS'):
    # Only define an https handler when httplib was built with SSL
    # support; otherwise https URLs fall through to UnknownHandler.
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(httplib.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

 

class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest Set-Cookie responses."""

    def __init__(self, cookiejar=None):
        import cookielib
        self.cookiejar = cookielib.CookieJar() if cookiejar is None else cookiejar

    def http_request(self, request):
        # Add a Cookie header for any cookies matching this request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Remember cookies the server set so later requests send them back.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

 

class UnknownHandler(BaseHandler):
    """Catch-all: any scheme no other handler claimed is an error."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)

 

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    A value wrapped in double quotes has one level of quoting removed.
    """
    parsed = {}
    for item in l:
        key, value = item.split('=', 1)
        # Strip one surrounding pair of double quotes, if present.
        if value[0] == value[-1] == '"':
            value = value[1:-1]
        parsed[key] = value
    return parsed

 

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = ''
    in_escape = False
    in_quote = False

    for ch in s:
        if in_escape:
            # Previous char was a backslash inside quotes: take this
            # one literally.
            buf += ch
            in_escape = False
        elif in_quote:
            if ch == '\\':
                # Swallow the backslash; flag the next char as literal.
                in_escape = True
            else:
                if ch == '"':
                    in_quote = False
                buf += ch
        elif ch == ',':
            # Unquoted comma: element boundary.
            items.append(buf)
            buf = ''
        else:
            if ch == '"':
                in_quote = True
            buf += ch

    # append last part
    if buf:
        items.append(buf)

    return [item.strip() for item in items]

 

def _safe_gethostbyname(host):
    """Resolve host to an IP string, or None when resolution fails."""
    try:
        addr = socket.gethostbyname(host)
    except socket.gaierror:
        return None
    return addr

 

class FileHandler(BaseHandler):
    """Handler for file: URLs; remote-looking ones are retried as FTP."""
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # file://otherhost/...: not a local path, hand off as ftp.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Cached (class-wide) set of IP addresses that count as "this
        # machine" for file:// host checks.
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open the local file named by req; raise URLError otherwise."""
        import email.utils
        import mimetypes
        host = req.get_host()
        filename = req.get_selector()
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            # Synthesize HTTP-style response headers for addinfourl.
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified)))
            if host:
                host, port = splitport(host)
            # Serve only when the URL names no host, or a port-less
            # host that resolves to this machine.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError, msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

 

class FTPHandler(BaseHandler):
    """Handler for ftp: URLs, using one fresh connection per request."""
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        # Split ";type=..." attributes off the selector, then the path
        # into directory components plus the final file component.
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Image (binary) transfer for a file, directory listing otherwise,
            # unless a ;type= attribute overrides it below.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            # Synthesize HTTP-style headers for addinfourl.
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # Re-raise with the original traceback so the cause is visible.
            raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot connection; CacheFTPHandler overrides this to pool.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
                        persistent=False)
##        fw.ftp.set_debuglevel(1)
        return fw

 

class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps a pool of recently used connections.

    Connections are cached per (user, host, port, path, timeout) key
    and reused until self.delay seconds of inactivity, bounded by
    self.max_conns entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # idle lifetime of a cached connection (seconds)
        self.max_conns = 16  # cache size bound

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this key, creating one if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            # Refresh the expiry of the connection we are about to reuse.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size bound."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in self.timeout.items():
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # Bug fix: min() raises ValueError on an empty sequence, which
        # happened whenever every cached connection had expired.
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0

        # then check the size
        # NOTE(review): '==' means a max_conns lowered below the current
        # cache size via setMaxConns() is never enforced; left as-is to
        # preserve existing behavior.
        if len(self.cache) == self.max_conns:
            for k, v in self.timeout.items():
                if v == self.soonest:
                    # Bug fix: close the evicted connection instead of
                    # leaking the open FTP session.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()