Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

1001

1002

1003

1004

1005

1006

1007

1008

1009

1010

1011

1012

1013

1014

1015

1016

1017

1018

1019

1020

1021

1022

1023

1024

1025

1026

1027

1028

1029

1030

1031

1032

1033

1034

1035

1036

1037

1038

1039

1040

1041

1042

1043

1044

1045

1046

1047

1048

1049

1050

1051

1052

1053

1054

1055

1056

1057

1058

1059

1060

1061

1062

1063

1064

1065

1066

1067

1068

1069

1070

1071

1072

1073

1074

1075

1076

1077

1078

1079

1080

1081

1082

1083

1084

1085

1086

1087

1088

1089

1090

1091

1092

1093

1094

1095

1096

1097

1098

1099

1100

1101

1102

1103

1104

1105

1106

1107

1108

1109

1110

1111

1112

1113

1114

1115

1116

1117

1118

1119

1120

1121

1122

1123

1124

1125

1126

1127

1128

1129

1130

1131

1132

1133

1134

1135

1136

1137

1138

1139

1140

1141

1142

1143

1144

1145

1146

1147

1148

1149

1150

1151

1152

1153

1154

1155

1156

1157

1158

1159

1160

1161

1162

1163

1164

1165

1166

1167

1168

1169

1170

1171

1172

1173

1174

1175

1176

1177

1178

1179

1180

1181

1182

1183

1184

1185

1186

1187

1188

1189

1190

1191

1192

1193

1194

1195

1196

1197

1198

1199

1200

1201

1202

1203

1204

1205

1206

1207

1208

1209

1210

1211

1212

1213

1214

1215

1216

1217

1218

1219

1220

1221

1222

1223

1224

1225

1226

1227

1228

1229

1230

1231

1232

1233

1234

1235

1236

1237

1238

1239

1240

1241

1242

1243

1244

1245

1246

1247

1248

1249

1250

1251

1252

1253

1254

1255

1256

1257

1258

1259

1260

1261

1262

1263

1264

1265

1266

1267

1268

1269

1270

1271

1272

1273

1274

1275

1276

1277

1278

1279

1280

1281

1282

1283

1284

1285

1286

1287

1288

1289

1290

1291

1292

1293

1294

1295

1296

1297

1298

1299

1300

1301

1302

1303

1304

1305

1306

1307

1308

1309

1310

1311

1312

1313

1314

1315

1316

1317

1318

1319

1320

1321

1322

1323

1324

1325

1326

1327

1328

1329

1330

1331

1332

1333

1334

1335

1336

1337

1338

1339

1340

1341

1342

1343

1344

1345

1346

1347

1348

1349

1350

1351

1352

1353

1354

1355

1356

1357

1358

1359

1360

1361

1362

1363

1364

1365

1366

1367

1368

1369

1370

1371

1372

1373

1374

1375

1376

1377

1378

1379

1380

1381

1382

1383

1384

1385

1386

1387

1388

1389

1390

1391

1392

1393

1394

1395

1396

1397

1398

1399

1400

1401

1402

1403

1404

1405

1406

1407

1408

1409

1410

1411

1412

1413

1414

1415

1416

1417

1418

1419

1420

1421

1422

1423

1424

1425

1426

1427

1428

1429

1430

1431

1432

1433

1434

1435

1436

1437

1438

1439

1440

1441

1442

1443

1444

1445

1446

1447

1448

1449

1450

1451

1452

1453

1454

1455

1456

1457

1458

1459

1460

1461

1462

1463

1464

1465

1466

1467

1468

1469

1470

1471

1472

1473

1474

1475

1476

1477

1478

1479

1480

1481

1482

1483

1484

1485

1486

1487

1488

1489

1490

1491

1492

1493

1494

1495

1496

1497

1498

1499

1500

1501

1502

1503

1504

1505

1506

1507

1508

1509

1510

1511

1512

1513

1514

1515

1516

1517

1518

1519

1520

1521

1522

1523

1524

1525

1526

1527

1528

1529

1530

1531

1532

1533

1534

1535

1536

1537

1538

1539

1540

1541

1542

1543

1544

1545

1546

1547

1548

1549

1550

1551

1552

1553

1554

1555

1556

1557

1558

# Copyright 2009-2010 by Peter Cock.  All rights reserved. 

# Based on code contributed and copyright 2009 by Jose Blanca (COMAV-UPV). 

# 

# This code is part of the Biopython distribution and governed by its 

# license.  Please see the LICENSE file that should have been included 

# as part of this package. 

"""Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format. 

 

SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for 

Biomedical Research and the Wellcome Trust Sanger Institute. SFF was also used 

as the native output format from early versions of Ion Torrent's PGM platform 

as well. You are expected to use this module via the Bio.SeqIO functions under 

the format name "sff" (or "sff-trim" as described below). 

 

For example, to iterate over the records in an SFF file, 

 

    >>> from Bio import SeqIO 

    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"): 

    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20])) 

    ... 

    E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT... 

    E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA... 

    E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC... 

    E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT... 

    E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA... 

    E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC... 

    E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA... 

    E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG... 

    E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC... 

    E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA... 

 

Each SeqRecord object will contain all the annotation from the SFF file, 

including the PHRED quality scores. 

 

    >>> print("%s %i" % (record.id, len(record))) 

    E3MFGYR02F7Z7G 219 

    >>> print("%s..." % record.seq[:10]) 

    tcagAATCAT... 

    >>> print("%r..." % (record.letter_annotations["phred_quality"][:10])) 

    [22, 21, 23, 28, 26, 15, 12, 21, 28, 21]... 

 

Notice that the sequence is given in mixed case, the central upper case region 

corresponds to the trimmed sequence. This matches the output of the Roche 

tools (and the 3rd party tool sff_extract) for SFF to FASTA. 

 

    >>> print(record.annotations["clip_qual_left"]) 

    4 

    >>> print(record.annotations["clip_qual_right"]) 

    134 

    >>> print(record.seq[:4]) 

    tcag 

    >>> print("%s...%s" % (record.seq[4:20], record.seq[120:134])) 

    AATCATCCACTTTTTA...CAAAACACAAACAG 

    >>> print(record.seq[134:]) 

    atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn 

 

The annotations dictionary also contains any adapter clip positions 

(usually zero), and information about the flows. e.g. 

 

    >>> len(record.annotations) 

    11 

    >>> print(record.annotations["flow_key"]) 

    TCAG 

    >>> print(record.annotations["flow_values"][:10]) 

    (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) 

    >>> print(len(record.annotations["flow_values"])) 

    400 

    >>> print(record.annotations["flow_index"][:10]) 

    (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) 

    >>> print(len(record.annotations["flow_index"])) 

    219 

 

Note that to convert from a raw reading in flow_values to the corresponding 

homopolymer stretch estimate, the value should be rounded to the nearest 100: 

 

    >>> print("%r..." % [int(round(value, -2)) // 100 

    ...                  for value in record.annotations["flow_values"][:10]]) 

    ... 

    [1, 0, 1, 0, 0, 1, 0, 1, 0, 2]... 

 

If a read name is exactly 14 alphanumeric characters, the annotations 

dictionary will also contain meta-data about the read extracted by 

interpreting the name as a 454 Sequencing System "Universal" Accession 

Number. Note that if a read name happens to be exactly 14 alphanumeric 

characters but was not generated automatically, these annotation records 

will contain nonsense information. 

 

    >>> print(record.annotations["region"]) 

    2 

    >>> print(record.annotations["time"]) 

    [2008, 1, 9, 16, 16, 0] 

    >>> print(record.annotations["coords"]) 

    (2434, 1658) 

 

As a convenience method, you can read the file with SeqIO format name "sff-trim" 

instead of "sff" to get just the trimmed sequences (without any annotation 

except for the PHRED quality scores and anything encoded in the read names): 

 

    >>> from Bio import SeqIO 

    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"): 

    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20])) 

    ... 

    E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC... 

    E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC... 

    E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG... 

    E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT... 

    E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA... 

    E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC... 

    E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC... 

    E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC... 

    E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG... 

    E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT... 

 

Looking at the final record in more detail, note how this differs to the 

example above: 

 

    >>> print("%s %i" % (record.id, len(record))) 

    E3MFGYR02F7Z7G 130 

    >>> print("%s..." % record.seq[:10]) 

    AATCATCCAC... 

    >>> print("%r..." % record.letter_annotations["phred_quality"][:10]) 

    [26, 15, 12, 21, 28, 21, 36, 28, 27, 27]... 

    >>> len(record.annotations) 

    3 

    >>> print(record.annotations["region"]) 

    2 

    >>> print(record.annotations["coords"]) 

    (2434, 1658) 

    >>> print(record.annotations["time"]) 

    [2008, 1, 9, 16, 16, 0] 

 

You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF 

reads into a FASTQ file (or a FASTA file and a QUAL file), e.g. 

 

    >>> from Bio import SeqIO 

    >>> try: 

    ...     from StringIO import StringIO # Python 2 

    ... except ImportError: 

    ...     from io import StringIO # Python 3 

    ... 

    >>> out_handle = StringIO() 

    >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff", 

    ...                       out_handle, "fastq") 

    ... 

    >>> print("Converted %i records" % count) 

    Converted 10 records 

 

The output FASTQ file would start like this: 

 

    >>> print("%s..." % out_handle.getvalue()[:50]) 

    @E3MFGYR02JWQ7T 

    tcagGGTCTACATGTTGGTTAACCCGTACTGATT... 

 

Bio.SeqIO.index() provides memory efficient random access to the reads in an 

SFF file by name. SFF files can include an index within the file, which can 

be read in making this very fast. If the index is missing (or in a format not 

yet supported in Biopython) the file is indexed by scanning all the reads - 

which is a little slower. For example, 

 

    >>> from Bio import SeqIO 

    >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff") 

    >>> record = reads["E3MFGYR02JHD4H"] 

    >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20])) 

    E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC... 

    >>> reads.close() 

 

Or, using the trimmed reads: 

 

    >>> from Bio import SeqIO 

    >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim") 

    >>> record = reads["E3MFGYR02JHD4H"] 

    >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20])) 

    E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG... 

    >>> reads.close() 

 

You can also use the Bio.SeqIO.write() function with the "sff" format. Note 

that this requires all the flow information etc, and thus is probably only 

useful for SeqRecord objects originally from reading another SFF file (and 

not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim"). 

 

As an example, let's pretend this example SFF file represents some DNA which 

was pre-amplified with a PCR primer AAAGANNNNN. The following script would 

produce a sub-file containing all those reads whose post-quality clipping 

region (i.e. the sequence after trimming) starts with AAAGA exactly (the non- 

degenerate bit of this pretend primer): 

 

    >>> from Bio import SeqIO 

    >>> records = (record for record in 

    ...            SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff") 

    ...            if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA")) 

    ... 

    >>> count = SeqIO.write(records, "temp_filtered.sff", "sff") 

    >>> print("Selected %i records" % count) 

    Selected 2 records 

 

Of course, for an assembly you would probably want to remove these primers. 

If you want FASTA or FASTQ output, you could just slice the SeqRecord. However, 

if you want SFF output we have to preserve all the flow information - the trick 

is just to adjust the left clip position! 

 

    >>> from Bio import SeqIO 

    >>> def filter_and_trim(records, primer): 

    ...     for record in records: 

    ...         if record.seq[record.annotations["clip_qual_left"]:].startswith(primer): 

    ...             record.annotations["clip_qual_left"] += len(primer) 

    ...             yield record 

    ... 

    >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff") 

    >>> count = SeqIO.write(filter_and_trim(records, "AAAGA"), 

    ...                     "temp_filtered.sff", "sff") 

    ... 

    >>> print("Selected %i records" % count) 

    Selected 2 records 

 

We can check the results, note the lower case clipped region now includes the "AAAGA" 

sequence: 

 

    >>> for record in SeqIO.parse("temp_filtered.sff", "sff"): 

    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20])) 

    ... 

    E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC... 

    E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA... 

    >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"): 

    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20])) 

    ... 

    E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG... 

    E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG... 

    >>> import os 

    >>> os.remove("temp_filtered.sff") 

 

For a description of the file format, please see the Roche manuals and: 

http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats 

 

""" 

 

from __future__ import print_function 

 

from Bio.SeqIO.Interfaces import SequenceWriter 

from Bio import Alphabet 

from Bio.Seq import Seq 

from Bio.SeqRecord import SeqRecord 

import struct 

import sys 

import re 

 

from Bio._py3k import _bytes_to_string, _as_bytes 

_null = b"\0" 

_sff = b".sff" 

_hsh = b".hsh" 

_srt = b".srt" 

_mft = b".mft" 

_flag = b"\xff" 

 

 

def _sff_file_header(handle):
    """Read in an SFF file header (PRIVATE).

    Assumes the handle is at the start of the file, will read forwards
    through the header and leave the handle pointing at the first record.
    Returns a tuple of values from the header (header_length, index_offset,
    index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)

    >>> with open("Roche/greek.sff", "rb") as handle:
    ...     values = _sff_file_header(handle)
    ...
    >>> print(values[0])
    840
    >>> print(values[1])
    65040
    >>> print(values[2])
    256
    >>> print(values[3])
    24
    >>> print(values[4])
    800
    >>> values[-1]
    'TCAG'

    """
    # Universal newline mode would corrupt the binary data, so refuse it
    # outright; on Windows a text-mode handle is equally unsafe.
    if hasattr(handle, "mode") and "U" in handle.mode.upper():
        raise ValueError("SFF files must NOT be opened in universal new "
                         "lines mode. Binary mode is recommended (although "
                         "on Unix the default mode is also fine).")
    elif hasattr(handle, "mode") and "B" not in handle.mode.upper() \
            and sys.platform == "win32":
        raise ValueError("SFF files must be opened in binary mode on Windows")
    # File header (part one), big endian encoding:
    #   use big endian encoding    >
    #   magic_number               I
    #   version                    4B
    #   index_offset               Q
    #   index_length               I
    #   number_of_reads            I
    #   header_length              H
    #   key_length                 H
    #   number_of_flows_per_read   H
    #   flowgram_format_code       B
    # [rest of file header depends on the number of flows and how many keys]
    fmt = '>4s4BQIIHHHB'
    assert 31 == struct.calcsize(fmt)
    data = handle.read(31)
    if not data:
        raise ValueError("Empty file.")
    elif len(data) < 13:
        raise ValueError("File too small to hold a valid SFF header.")
    magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
        number_of_reads, header_length, key_length, number_of_flows_per_read, \
        flowgram_format = struct.unpack(fmt, data)
    if magic_number in [_hsh, _srt, _mft]:
        # Probably user error, calling Bio.SeqIO.parse() twice!
        raise ValueError("Handle seems to be at SFF index block, not start")
    if magic_number != _sff:  # 779314790
        raise ValueError("SFF file did not start '.sff', but %s"
                         % repr(magic_number))
    if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
        raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i"
                         % (ver0, ver1, ver2, ver3))
    if flowgram_format != 1:
        raise ValueError("Flowgram format code %i not supported"
                         % flowgram_format)
    # Index offset and length must be both zero (no index) or both non-zero.
    if (index_offset != 0) ^ (index_length != 0):
        raise ValueError("Index offset %i but index length %i"
                         % (index_offset, index_length))
    # Variable-length part of the header: one char per flow, then the key.
    flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
    key_sequence = _bytes_to_string(handle.read(key_length))
    # According to the spec, the header_length field should be the total number
    # of bytes required by this set of header fields, and should be equal to
    # "31 + number_of_flows_per_read + key_length" rounded up to the next value
    # divisible by 8.
    assert header_length % 8 == 0
    padding = header_length - number_of_flows_per_read - key_length - 31
    assert 0 <= padding < 8, padding
    # Consume the padding; non-null bytes there mean a malformed file, but we
    # only warn so that parsing can continue.
    if handle.read(padding).count(_null) != padding:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Your SFF file is invalid, post header %i byte "
                      "null padding region contained data." % padding,
                      BiopythonParserWarning)
    return header_length, index_offset, index_length, \
        number_of_reads, number_of_flows_per_read, \
        flow_chars, key_sequence

 

 

def _sff_do_slow_index(handle):
    """Generates an index by scanning through all the reads in an SFF file (PRIVATE).

    This is a slow but generic approach if we can't parse the provided index
    (if present).

    Yields (read_name, file_offset) tuples, one per read, in file order.

    Will use the handle seek/tell functions.
    """
    handle.seek(0)
    header_length, index_offset, index_length, number_of_reads, \
        number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    # Now on to the reads...
    # Fixed-size part of each read header: read_header_length, name_length,
    # seq_len, then the four clip positions.
    read_header_fmt = '>2HI4H'
    read_header_size = struct.calcsize(read_header_fmt)
    # NOTE - assuming flowgram_format==1, which means struct type H
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    assert 1 == struct.calcsize(">B")
    assert 1 == struct.calcsize(">s")
    assert 1 == struct.calcsize(">c")
    assert read_header_size % 8 == 0  # Important for padding calc later!
    for read in range(number_of_reads):
        record_offset = handle.tell()
        if record_offset == index_offset:
            # Found index block within reads, ignore it:
            # skip over the index plus its trailing padding to the next
            # 8-byte boundary, where the following read starts.
            offset = index_offset + index_length
            if offset % 8:
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            record_offset = offset
        # assert record_offset%8 == 0 # Worth checking, but slow
        # First the fixed header
        data = handle.read(read_header_size)
        read_header_length, name_length, seq_len, clip_qual_left, \
            clip_qual_right, clip_adapter_left, clip_adapter_right \
            = struct.unpack(read_header_fmt, data)
        if read_header_length < 10 or read_header_length % 8 != 0:
            raise ValueError("Malformed read header, says length is %i:\n%s"
                             % (read_header_length, repr(data)))
        # Now the name and any padding (remainder of header)
        name = _bytes_to_string(handle.read(name_length))
        padding = read_header_length - read_header_size - name_length
        if handle.read(padding).count(_null) != padding:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Your SFF file is invalid, post name %i byte "
                          "padding region contained data" % padding,
                          BiopythonParserWarning)
        assert record_offset + read_header_length == handle.tell()
        # Now the flowgram values, flowgram index, bases and qualities:
        # flow values (2 bytes each), then one byte each per base for the
        # flow index, the base call, and the quality score.
        size = read_flow_size + 3 * seq_len
        handle.seek(size, 1)
        # Now any padding...
        padding = size % 8
        if padding:
            padding = 8 - padding
            if handle.read(padding).count(_null) != padding:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn("Your SFF file is invalid, post quality %i "
                              "byte padding region contained data" % padding,
                              BiopythonParserWarning)
        # print("%s %s %i" % (read, name, record_offset))
        yield name, record_offset
    # Every record (and the index block) is 8-byte aligned, so the file
    # position must be too once all reads have been scanned.
    if handle.tell() % 8 != 0:
        raise ValueError(
            "After scanning reads, did not end on a multiple of 8")

 

 

def _sff_find_roche_index(handle):
    """Locate any existing Roche style XML meta data and read index (PRIVATE).

    Makes a number of hard coded assumptions based on reverse engineered SFF
    files from Roche 454 machines.

    Returns a tuple of read count, header length, SFF "index" offset and size,
    XML offset and size, and the actual read index offset and size.

    Raises a ValueError for unsupported or non-Roche index blocks.
    """
    handle.seek(0)
    header_length, index_offset, index_length, number_of_reads, \
        number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    assert handle.tell() == header_length
    # Bug fix: the original tested "not index_offset or not index_offset"
    # (the same variable twice); an index is only present when BOTH the
    # offset and the length are non-zero.
    if not index_offset or not index_length:
        raise ValueError("No index present in this SFF file")
    # Now jump to the index block...
    handle.seek(index_offset)
    fmt = ">4s4B"  # 4 byte magic number, then 4 single-byte version fields
    fmt_size = struct.calcsize(fmt)
    data = handle.read(fmt_size)
    if not data:
        raise ValueError("Premature end of file? Expected index of size %i at offset %i, found nothing"
                         % (index_length, index_offset))
    if len(data) < fmt_size:
        raise ValueError("Premature end of file? Expected index of size %i at offset %i, found %s"
                         % (index_length, index_offset, repr(data)))
    magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
    if magic_number == _mft:  # 778921588
        # Roche 454 manifest index
        # This is typical from raw Roche 454 SFF files (2009), and includes
        # both an XML manifest and the sorted index.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            # This is "1.00" as a string
            raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i"
                             % (ver0, ver1, ver2, ver3))
        # Two 4-byte lengths follow: XML manifest size, then read index size.
        fmt2 = ">LL"
        fmt2_size = struct.calcsize(fmt2)
        xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
        if index_length != fmt_size + fmt2_size + xml_size + data_size:
            raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i"
                             % (index_length, fmt_size, fmt2_size, xml_size, data_size))
        return number_of_reads, header_length, \
            index_offset, index_length, \
            index_offset + fmt_size + fmt2_size, xml_size, \
            index_offset + fmt_size + fmt2_size + xml_size, data_size
    elif magic_number == _srt:  # 779317876
        # Roche 454 sorted index
        # I've had this from Roche tool sfffile when the read identifiers
        # had nonstandard lengths and there was no XML manifest.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            # This is "1.00" as a string
            raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i"
                             % (ver0, ver1, ver2, ver3))
        data = handle.read(4)
        if data != _null * 4:
            raise ValueError(
                "Did not find expected null four bytes in .srt index")
        # No XML manifest in a plain .srt block, hence the 0, 0 entries.
        return number_of_reads, header_length, \
            index_offset, index_length, \
            0, 0, \
            index_offset + fmt_size + 4, index_length - fmt_size - 4
    elif magic_number == _hsh:
        raise ValueError("Hash table style indexes (.hsh) in SFF files are "
                         "not (yet) supported")
    else:
        raise ValueError("Unknown magic number %s in SFF index header:\n%s"
                         % (repr(magic_number), repr(data)))

 

 

def ReadRocheXmlManifest(handle):
    """Reads any Roche style XML manifest data in the SFF "index".

    The SFF file format allows for multiple different index blocks, and Roche
    took advantage of this to define their own index block which also embeds
    an XML manifest string. This is not a publicly documented extension to
    the SFF file format, this was reverse engineered.

    The handle should be to an SFF file opened in binary mode. This function
    will use the handle seek/tell functions and leave the handle in an
    arbitrary location.

    Any XML manifest found is returned as a Python string, which you can then
    parse as appropriate, or reuse when writing out SFF files with the
    SffWriter class.

    Returns a string, or raises a ValueError if a Roche manifest could not be
    found.
    """
    # Only the XML block coordinates (positions 4 and 5 of the index
    # description tuple) matter here.
    index_info = _sff_find_roche_index(handle)
    xml_offset = index_info[4]
    xml_size = index_info[5]
    # A zero offset or size means the index block carries no manifest.
    if not (xml_offset and xml_size):
        raise ValueError("No XML manifest found")
    handle.seek(xml_offset)
    return _bytes_to_string(handle.read(xml_size))

 

 

#This is a generator function!
def _sff_read_roche_index(handle):
    """Reads any existing Roche style read index provided in the SFF file (PRIVATE).

    Will use the handle seek/tell functions.

    This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.

    Roche SFF indices use base 255 not 256, meaning we see bytes in range the
    range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
    can be used as a marker character to separate entries (required if the
    read name lengths vary).

    Note that since only four bytes are used for the read offset, this is
    limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
    tool to combine SFF files beyound this limit, they issue a warning and
    omit the index (and manifest).

    Yields (name, offset) tuples, one per read, where offset is the
    file position of that read's record.
    """
    number_of_reads, header_length, index_offset, index_length, xml_offset, \
        xml_size, read_index_offset, read_index_size = _sff_find_roche_index(
            handle)
    #Now parse the read index...
    handle.seek(read_index_offset)
    #Five single bytes: a leading byte (must be zero, checked below as off4)
    #followed by four base-255 "digits" of the read's file offset:
    fmt = ">5B"
    for read in range(number_of_reads):
        #TODO - Be more aware of when the index should end?
        #Read names vary in length, so grab a minimal chunk then scan one
        #byte at a time for the 0xFF entry separator (_flag):
        data = handle.read(6)
        while True:
            more = handle.read(1)
            if not more:
                raise ValueError("Premature end of file!")
            data += more
            if more == _flag:
                break
        assert data[-1:] == _flag, data[-1:]
        #Entry layout: read name, five offset bytes, then the flag byte
        name = _bytes_to_string(data[:-6])
        off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
        #Offsets are stored base 255 (65025 = 255**2, 16581375 = 255**3)
        offset = off0 + 255 * off1 + 65025 * off2 + 16581375 * off3
        if off4:
            #Could in theory be used as a fifth piece of offset information,
            #i.e. offset =+ 4228250625L*off4, but testing the Roche tools this
            #is not the case. They simple don't support such large indexes.
            raise ValueError("Expected a null terminator to the read name.")
        yield name, offset
    if handle.tell() != read_index_offset + read_index_size:
        raise ValueError("Problem with index length? %i vs %i"
                         % (handle.tell(), read_index_offset + read_index_size))

 

#454 universal accession numbers are exactly 14 alphanumeric characters
#(e.g. "E3MFGYR02JWQ7T"); names matching this pattern get time/region/xy
#annotations decoded from them (see _sff_read_seq_record).
_valid_UAN_read_name = re.compile(r'^[a-zA-Z0-9]{14}$')

 

 

def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
                         key_sequence, alphabet, trim=False):
    """Parse the next read in the file, return data as a SeqRecord (PRIVATE).

    Assumes the handle is positioned at the start of a read record.
    If trim is True, the quality/adapter clipping is applied to the sequence
    and qualities, and no flowgram annotations are recorded (they would no
    longer line up with the trimmed sequence). Otherwise the full read is
    returned in mixed case (clip regions lower case) with the flowgram data
    and clip points in the record's annotations dictionary.
    """
    #Now on to the reads...
    #the read header format (fixed part):
    #read_header_length     H
    #name_length            H
    #seq_len                I
    #clip_qual_left         H
    #clip_qual_right        H
    #clip_adapter_left      H
    #clip_adapter_right     H
    #[rest of read header depends on the name length etc]
    read_header_fmt = '>2HI4H'
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)

    read_header_length, name_length, seq_len, clip_qual_left, \
        clip_qual_right, clip_adapter_left, clip_adapter_right \
        = struct.unpack(read_header_fmt, handle.read(read_header_size))
    #SFF clip points are one-based; convert left clips to Python slicing
    #(zero means "no clip" so is left untouched):
    if clip_qual_left:
        clip_qual_left -= 1  # python counting
    if clip_adapter_left:
        clip_adapter_left -= 1  # python counting
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError("Malformed read header, says length is %i"
                         % read_header_length)
    #now the name and any padding (remainder of header)
    name = _bytes_to_string(handle.read(name_length))
    padding = read_header_length - read_header_size - name_length
    if handle.read(padding).count(_null) != padding:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Your SFF file is invalid, post name %i "
                      "byte padding region contained data" % padding,
                      BiopythonParserWarning)
    #now the flowgram values, flowgram index, bases and qualities
    #NOTE - assuming flowgram_format==1, which means struct type H
    flow_values = handle.read(read_flow_size)  # unpack later if needed
    temp_fmt = ">%iB" % seq_len  # used for flow index and quals
    flow_index = handle.read(seq_len)  # unpack later if needed
    seq = _bytes_to_string(handle.read(seq_len))  # TODO - Use bytes in Seq?
    quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
    #now any padding...
    padding = (read_flow_size + seq_len * 3) % 8
    if padding:
        padding = 8 - padding
        if handle.read(padding).count(_null) != padding:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Your SFF file is invalid, post quality %i "
                          "byte padding region contained data" % padding,
                          BiopythonParserWarning)
    #Follow Roche and apply most aggressive of qual and adapter clipping.
    #Note Roche seems to ignore adapter clip fields when writing SFF,
    #and uses just the quality clipping values for any clipping.
    clip_left = max(clip_qual_left, clip_adapter_left)
    #Right clipping of zero means no clipping
    if clip_qual_right:
        if clip_adapter_right:
            clip_right = min(clip_qual_right, clip_adapter_right)
        else:
            #Typical case with Roche SFF files
            clip_right = clip_qual_right
    elif clip_adapter_right:
        clip_right = clip_adapter_right
    else:
        clip_right = seq_len
    #Now build a SeqRecord
    if trim:
        if clip_left >= clip_right:
            # Raise an error?
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Overlapping clip values in SFF record, trimmed to nothing",
                          BiopythonParserWarning)
            seq = ""
            quals = []
        else:
            seq = seq[clip_left:clip_right].upper()
            quals = quals[clip_left:clip_right]
        #Don't record the clipping values, flow etc, they make no sense now:
        annotations = {}
    else:
        if clip_left >= clip_right:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Overlapping clip values in SFF record", BiopythonParserWarning)
            seq = seq.lower()
        else:
            #This use of mixed case mimics the Roche SFF tool's FASTA output
            seq = seq[:clip_left].lower() + \
                seq[clip_left:clip_right].upper() + \
                seq[clip_right:].lower()
        annotations = {"flow_values": struct.unpack(read_flow_fmt, flow_values),
                       "flow_index": struct.unpack(temp_fmt, flow_index),
                       "flow_chars": flow_chars,
                       "flow_key": key_sequence,
                       "clip_qual_left": clip_qual_left,
                       "clip_qual_right": clip_qual_right,
                       "clip_adapter_left": clip_adapter_left,
                       "clip_adapter_right": clip_adapter_right}
    #454 style read names also encode the timestamp, region and coordinates:
    if re.match(_valid_UAN_read_name, name):
        annotations["time"] = _get_read_time(name)
        annotations["region"] = _get_read_region(name)
        annotations["coords"] = _get_read_xy(name)
    record = SeqRecord(Seq(seq, alphabet),
                       id=name,
                       name=name,
                       description="",
                       annotations=annotations)
    #Dirty trick to speed up this line:
    #record.letter_annotations["phred_quality"] = quals
    dict.__setitem__(record._per_letter_annotations,
                     "phred_quality", quals)
    #Return the record and then continue...
    return record

 

#Precomputed 36**0 through 36**5, used by _string_as_base_36 which decodes
#at most six base-36 characters at a time.
_powers_of_36 = [36 ** i for i in range(6)]

 

 

def _string_as_base_36(string):
    """Interpret a string as a base-36 number as per 454 manual.

    Letters (either case) map to digit values 0 to 25, while '0' to '9'
    map to 26 to 35; any other character counts as zero. Only the last
    six characters contribute (one per entry in _powers_of_36), with the
    final character being the least significant digit.
    """
    total = 0
    for char, power in zip(reversed(string), _powers_of_36):
        code = ord(char)
        if 97 <= code <= 122:
            #'a' (97) through 'z' (122) -> 0 through 25
            digit = code - 97
        elif 65 <= code <= 90:
            #'A' (65) through 'Z' (90) -> 0 through 25
            digit = code - 65
        elif 48 <= code <= 57:
            #'0' (48) through '9' (57) -> 26 through 35
            digit = code - 22
        else:
            #Unexpected character, treat as zero
            digit = 0
        total += digit * power
    return total

 

 

def _get_read_xy(read_name):
    """Extract coordinates from last 5 characters of read name.

    Returns an (x, y) tuple decoded from the base-36 well location.
    """
    location = _string_as_base_36(read_name[9:])
    x, y = divmod(location, 4096)
    return x, y

 

#Divisors used by _get_read_time to split the packed base-36 timestamp
#into six fields by repeated divmod (note the mixed radices, e.g. 13
#and 32 for the two largest terms).
_time_denominators = [13 * 32 * 24 * 60 * 60,
                      32 * 24 * 60 * 60,
                      24 * 60 * 60,
                      60 * 60,
                      60]

 

 

def _get_read_time(read_name):
    """Extract time from first 6 characters of read name.

    Returns a list of six integers; the first field has 2000 added to
    it (i.e. it reads as a calendar year).
    """
    fields = []
    remaining = _string_as_base_36(read_name[:6])
    #Peel off successive fields by repeated division; what is left at
    #the end is the final (smallest) field.
    for divisor in _time_denominators:
        quotient, remaining = divmod(remaining, divisor)
        fields.append(quotient)
    fields.append(remaining)
    #The leading field is stored relative to the year 2000
    fields[0] += 2000
    return fields

 

 

def _get_read_region(read_name): 

    """Extract region from read name.""" 

    return int(read_name[8]) 

 

 

def _sff_read_raw_record(handle, number_of_flows_per_read):
    """Extract the next read in the file as a raw (bytes) string (PRIVATE).

    Assumes the handle is positioned at the start of a read record.
    Returns the entire record (fixed header, clip values, name, padding,
    flowgram values, flow index, bases, qualities and trailing padding)
    as raw bytes, without decoding the flow or quality data.
    """
    #Only the first three header fields are needed to size the record;
    #the four clip values are copied through unparsed below.
    read_header_fmt = '>2HI'
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)

    raw = handle.read(read_header_size)
    read_header_length, name_length, seq_len \
        = struct.unpack(read_header_fmt, raw)
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError("Malformed read header, says length is %i"
                         % read_header_length)
    #now the four clip values (4H = 8 bytes), and read name
    raw += handle.read(8 + name_length)
    #and any padding (remainder of header)
    padding = read_header_length - read_header_size - 8 - name_length
    pad = handle.read(padding)
    if pad.count(_null) != padding:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Your SFF file is invalid, post name %i "
                      "byte padding region contained data" % padding,
                      BiopythonParserWarning)
    raw += pad
    #now the flowgram values, flowgram index, bases and qualities
    raw += handle.read(read_flow_size + seq_len * 3)
    padding = (read_flow_size + seq_len * 3) % 8
    #now any padding...
    if padding:
        padding = 8 - padding
        pad = handle.read(padding)
        if pad.count(_null) != padding:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Your SFF file is invalid, post quality %i "
                          "byte padding region contained data" % padding,
                          BiopythonParserWarning)
        raw += pad
    #Return the raw bytes
    return raw

 

 

class _AddTellHandle(object): 

    """Wrapper for handles which do not support the tell method (PRIVATE). 

 

    Intended for use with things like network handles where tell (and reverse 

    seek) are not supported. The SFF file needs to track the current offset in 

    order to deal with the index block. 

    """ 

    def __init__(self, handle): 

        self._handle = handle 

        self._offset = 0 

 

    def read(self, length): 

        data = self._handle.read(length) 

        self._offset += len(data) 

        return data 

 

    def tell(self): 

        return self._offset 

 

    def seek(self, offset): 

        if offset < self._offset: 

            raise RunTimeError("Can't seek backwards") 

        self._handle.read(offset - self._offset) 

 

    def close(self): 

        return self._handle.close() 

 

 

#This is a generator function!
def SffIterator(handle, alphabet=Alphabet.generic_dna, trim=False):
    """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).

    handle - input file, an SFF file, e.g. from Roche 454 sequencing.
             This must NOT be opened in universal read lines mode!
    alphabet - optional alphabet, defaults to generic DNA.
    trim - should the sequences be trimmed?

    The resulting SeqRecord objects should match those from a paired FASTA
    and QUAL file converted from the SFF file using the Roche 454 tool
    ssfinfo. i.e. The sequence will be mixed case, with the trim regions
    shown in lower case.

    This function is used internally via the Bio.SeqIO functions:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
    ...     print("%s %i" % (record.id, len(record)))
    ...
    E3MFGYR02JWQ7T 265
    E3MFGYR02JA6IL 271
    E3MFGYR02JHD4H 310
    E3MFGYR02GFKUC 299
    E3MFGYR02FTGED 281
    E3MFGYR02FR9G7 261
    E3MFGYR02GAZMS 278
    E3MFGYR02HHZ8O 221
    E3MFGYR02GPGB1 269
    E3MFGYR02F7Z7G 219

    You can also call it directly:

    >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
    ...     for record in SffIterator(handle):
    ...         print("%s %i" % (record.id, len(record)))
    ...
    E3MFGYR02JWQ7T 265
    E3MFGYR02JA6IL 271
    E3MFGYR02JHD4H 310
    E3MFGYR02GFKUC 299
    E3MFGYR02FTGED 281
    E3MFGYR02FR9G7 261
    E3MFGYR02GAZMS 278
    E3MFGYR02HHZ8O 221
    E3MFGYR02GPGB1 269
    E3MFGYR02F7Z7G 219

    Or, with the trim option:

    >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
    ...     for record in SffIterator(handle, trim=True):
    ...         print("%s %i" % (record.id, len(record)))
    ...
    E3MFGYR02JWQ7T 260
    E3MFGYR02JA6IL 265
    E3MFGYR02JHD4H 292
    E3MFGYR02GFKUC 295
    E3MFGYR02FTGED 277
    E3MFGYR02FR9G7 256
    E3MFGYR02GAZMS 271
    E3MFGYR02HHZ8O 150
    E3MFGYR02GPGB1 221
    E3MFGYR02F7Z7G 130

    """
    #Reject alphabets that cannot describe SFF data up front:
    if isinstance(Alphabet._get_base_alphabet(alphabet),
                  Alphabet.ProteinAlphabet):
        raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
    if isinstance(Alphabet._get_base_alphabet(alphabet),
                  Alphabet.RNAAlphabet):
        raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
    try:
        assert 0 == handle.tell(), "Not at start of file, offset %i" % handle.tell()
    except AttributeError:
        #Probably a network handle or something like that; wrap it so we
        #can still track the offset (needed to skip the index block):
        handle = _AddTellHandle(handle)
    header_length, index_offset, index_length, number_of_reads, \
        number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    #Now on to the reads...
    #the read header format (fixed part):
    #read_header_length     H
    #name_length            H
    #seq_len                I
    #clip_qual_left         H
    #clip_qual_right        H
    #clip_adapter_left      H
    #clip_adapter_right     H
    #[rest of read header depends on the name length etc]
    read_header_fmt = '>2HI4H'
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    #Sanity check the struct module's size assumptions:
    assert 1 == struct.calcsize(">B")
    assert 1 == struct.calcsize(">s")
    assert 1 == struct.calcsize(">c")
    assert read_header_size % 8 == 0  # Important for padding calc later!
    #The spec allows for the index block to be before or even in the middle
    #of the reads. We can check that if we keep track of our position
    #in the file...
    for read in range(number_of_reads):
        if index_offset and handle.tell() == index_offset:
            #We've hit the index block; skip over it (and its padding
            #up to the next 8 byte boundary) to reach the next read:
            offset = index_offset + index_length
            if offset % 8:
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            #Now that we've done this, we don't need to do it again. Clear
            #the index_offset so we can skip extra handle.tell() calls:
            index_offset = 0
        yield _sff_read_seq_record(handle,
                                   number_of_flows_per_read,
                                   flow_chars,
                                   key_sequence,
                                   alphabet,
                                   trim)
    #Check the trailing index/padding and that nothing follows (e.g. a
    #second concatenated SFF file, which we cannot parse):
    _check_eof(handle, index_offset, index_length)

 

 

def _check_eof(handle, index_offset, index_length):
    """Check final padding is OK (8 byte alignment) and file ends (PRIVATE).

    Will attempt to spot apparent SFF file concatenation and give an error.

    Will not attempt to seek, only moves the handle forward.

    Raises ValueError on gaps before the index, suspicious padding, or
    trailing data; issues BiopythonParserWarning for recoverable problems.
    """
    offset = handle.tell()
    extra = b""
    padding = 0

    if index_offset and offset <= index_offset:
        # Index block then end of file...
        if offset < index_offset:
            raise ValueError("Gap of %i bytes after final record end %i, "
                             "before %i where index starts?"
                             % (index_offset - offset, offset, index_offset))
        # Doing read to jump the index rather than a seek
        # in case this is a network handle or similar 
        handle.read(index_offset + index_length - offset)
        offset = index_offset + index_length
        assert offset == handle.tell(), \
            "Wanted %i, got %i, index is %i to %i" \
            % (offset, handle.tell(), index_offset, index_offset + index_length)

    #SFF files are padded with nulls to an 8 byte boundary:
    if offset % 8:
        padding = 8 - (offset % 8)
        extra = handle.read(padding)

    if padding >= 4 and extra[-4:] == _sff:
        #Seen this in one user supplied file, should have been
        #four bytes of null padding but was actually .sff and
        #the start of a new concatenated SFF file!
        raise ValueError("Your SFF file is invalid, post index %i byte "
                         "null padding region ended '.sff' which could "
                         "be the start of a concatenated SFF file? "
                         "See offset %i" % (padding, offset))
    if padding and not extra:
        #TODO - Is this error harmless enough to just ignore?
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Your SFF file is technically invalid as it is missing "
                      "a terminal %i byte null padding region." % padding,
                      BiopythonParserWarning)
        return
    if extra.count(_null) != padding:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Your SFF file is invalid, post index %i byte "
                      "null padding region contained data: %r"
                      % (padding, extra), BiopythonParserWarning)

    offset = handle.tell()
    assert offset % 8 == 0, \
        "Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)
    # Should now be at the end of the file...
    #A read of four bytes lets us spot the ".sff" magic of a second file:
    extra = handle.read(4)
    if extra == _sff:
        raise ValueError("Additional data at end of SFF file, "
                         "perhaps multiple SFF files concatenated? "
                         "See offset %i" % offset)
    elif extra:
        raise ValueError("Additional data at end of SFF file, "
                         "see offset %i" % offset)

 

 

#This is a generator function!
def _SffTrimIterator(handle, alphabet=Alphabet.generic_dna):
    """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE).

    Thin wrapper which simply calls SffIterator with trimming enabled.
    """
    return SffIterator(handle, alphabet=alphabet, trim=True)

 

 

class SffWriter(SequenceWriter):
    """SFF file writer.

    Writes Roche 454 SFF binary files: a file header, one block per read,
    and (optionally) a Roche-style index block appended at the end.
    """

    def __init__(self, handle, index=True, xml=None):
        """Creates the writer object.

        handle - Output handle, ideally in binary write mode.
        index - Boolean argument, should we try and write an index?
        xml - Optional string argument, xml manifest to be recorded in the index
              block (see function ReadRocheXmlManifest for reading this data).
        """
        #Universal newline translation would corrupt the binary SFF data.
        if hasattr(handle, "mode") and "U" in handle.mode.upper():
            raise ValueError("SFF files must NOT be opened in universal new "
                             "lines mode. Binary mode is required")
        elif hasattr(handle, "mode") and "B" not in handle.mode.upper():
            raise ValueError("SFF files must be opened in binary mode")
        self.handle = handle
        self._xml = xml  # optional XML manifest text for the index block
        if index:
            self._index = []  # will collect (read name, file offset) pairs
        else:
            self._index = None  # None means no index block will be written

    def write_file(self, records):
        """Use this to write an entire file containing the given records."""
        try:
            self._number_of_reads = len(records)
        except TypeError:
            #records is an iterator/generator with no len(); write a dummy
            #read count now and seek back to patch the header at the end.
            self._number_of_reads = 0  # dummy value
            if not hasattr(self.handle, "seek") \
                    or not hasattr(self.handle, "tell"):
                raise ValueError("A handle with a seek/tell methods is "
                                 "required in order to record the total "
                                 "record count in the file header (once it "
                                 "is known at the end).")
        #Writing the index also needs to seek back and patch the header.
        if self._index is not None and \
                not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
            import warnings
            warnings.warn("A handle with a seek/tell methods is required in "
                          "order to record an SFF index.")
            self._index = None
        self._index_start = 0
        self._index_length = 0
        #Python 2 style iterator check; iter() is a no-op on iterators anyway.
        if not hasattr(records, "next"):
            records = iter(records)
        #Get the first record in order to find the flow information
        #we will need for the header.
        try:
            record = next(records)
        except StopIteration:
            record = None
        if record is None:
            #No records -> empty SFF file (or an error)?
            #We can't write a header without the flow information.
            #return 0
            raise ValueError("Must have at least one sequence")
        try:
            self._key_sequence = _as_bytes(record.annotations["flow_key"])
            self._flow_chars = _as_bytes(record.annotations["flow_chars"])
            self._number_of_flows_per_read = len(self._flow_chars)
        except KeyError:
            raise ValueError("Missing SFF flow information")
        self.write_header()
        self.write_record(record)
        count = 1
        for record in records:
            self.write_record(record)
            count += 1
        if self._number_of_reads == 0:
            #Must go back and record the record count...
            offset = self.handle.tell()
            self.handle.seek(0)
            self._number_of_reads = count
            self.write_header()
            self.handle.seek(offset)  # not essential?
        else:
            assert count == self._number_of_reads
        if self._index is not None:
            self._write_index()
        return count

    def _write_index(self):
        """Append the Roche-style index block and patch headers (PRIVATE).

        Writes the sorted (name, offset) entries collected by write_record,
        then seeks back to fill in the index block header and finally the
        file header (which records the index offset and length).
        """
        assert len(self._index) == self._number_of_reads
        handle = self.handle
        self._index.sort()
        self._index_start = handle.tell()  # need for header
        #XML...
        if self._xml is not None:
            xml = _as_bytes(self._xml)
        else:
            from Bio import __version__
            xml = "<!-- This file was output with Biopython %s -->\n" % __version__
            xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
            xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
            xml = _as_bytes(xml)
        xml_len = len(xml)
        #Write to the file...
        #Index block header: magic, 4 version bytes, xml length, index length.
        fmt = ">I4BLL"
        fmt_size = struct.calcsize(fmt)
        handle.write(_null * fmt_size + xml)  # fill this later
        fmt2 = ">6B"
        assert 6 == struct.calcsize(fmt2)
        self._index.sort()
        index_len = 0  # don't know yet!
        for name, offset in self._index:
            #Roche files record the offsets using base 255 not 256.
            #See comments for parsing the index block. There may be a faster
            #way to code this, but we can't easily use shifts due to odd base
            off3 = offset
            off0 = off3 % 255
            off3 -= off0
            off1 = off3 % 65025
            off3 -= off1
            off2 = off3 % 16581375
            off3 -= off2
            assert offset == off0 + off1 + off2 + off3, \
                "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
            off3, off2, off1, off0 = off3 // 16581375, off2 // 65025, \
                off1 // 255, off0
            assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
                "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
            #Entry is: name, null byte, four base-255 digits, 0xFF terminator.
            handle.write(name + struct.pack(fmt2, 0,
                                            off3, off2, off1, off0, 255))
            index_len += len(name) + 6
        #Note any padding is not included:
        self._index_length = fmt_size + xml_len + index_len  # need for header
        #Pad out to an 8 byte boundary (although I have noticed some
        #real Roche SFF files neglect to do this despite their manual
        #suggesting this padding should be there):
        if self._index_length % 8:
            padding = 8 - (self._index_length % 8)
            handle.write(_null * padding)
        else:
            padding = 0
        offset = handle.tell()
        assert offset == self._index_start + self._index_length + padding, \
            "%i vs %i + %i + %i" % (offset, self._index_start,
                                    self._index_length, padding)
        #Must now go back and update the index header with index size...
        handle.seek(self._index_start)
        handle.write(struct.pack(fmt, 778921588,  # magic number
                                 49, 46, 48, 48,  # Roche index version, "1.00"
                                 xml_len, index_len) + xml)
        #Must now go back and update the header...
        handle.seek(0)
        self.write_header()
        handle.seek(offset)  # not essential?

    def write_header(self):
        """Write the SFF file header at the current handle position.

        Requires self._number_of_reads, self._index_start, self._index_length,
        self._key_sequence, self._flow_chars and
        self._number_of_flows_per_read to have been set (see write_file,
        which may call this again later to patch in the final values).
        """
        #Do header...
        key_length = len(self._key_sequence)
        #file header (part one)
        #use big endian encoding    >
        #magic_number               I
        #version                    4B
        #index_offset               Q
        #index_length               I
        #number_of_reads            I
        #header_length              H
        #key_length                 H
        #number_of_flows_per_read   H
        #flowgram_format_code       B
        #[rest of file header depends on the number of flows and how many keys]
        fmt = '>I4BQIIHHHB%is%is' % (
            self._number_of_flows_per_read, key_length)
        #According to the spec, the header_length field should be the total
        #number of bytes required by this set of header fields, and should be
        #equal to "31 + number_of_flows_per_read + key_length" rounded up to
        #the next value divisible by 8.
        if struct.calcsize(fmt) % 8 == 0:
            padding = 0
        else:
            padding = 8 - (struct.calcsize(fmt) % 8)
        header_length = struct.calcsize(fmt) + padding
        assert header_length % 8 == 0
        header = struct.pack(fmt, 779314790,  # magic number 0x2E736666
                             0, 0, 0, 1,  # version
                             self._index_start, self._index_length,
                             self._number_of_reads,
                             header_length, key_length,
                             self._number_of_flows_per_read,
                             1,  # the only flowgram format code we support
                             self._flow_chars, self._key_sequence)
        self.handle.write(header + _null * padding)

    def write_record(self, record):
        """Write a single additional record to the output file.

        This assumes the header has been done.

        Requires the record to carry SFF annotations (flow_key, flow_chars,
        flow_values, flow_index, the four clip_* values) and PHRED qualities;
        raises ValueError if any are missing or inconsistent with the header.
        """
        #Basics
        name = _as_bytes(record.id)
        name_len = len(name)
        seq = _as_bytes(str(record.seq).upper())
        seq_len = len(seq)
        #Qualities
        try:
            quals = record.letter_annotations["phred_quality"]
        except KeyError:
            raise ValueError("Missing PHRED qualities information for %s" % record.id)
        #Flow
        try:
            flow_values = record.annotations["flow_values"]
            flow_index = record.annotations["flow_index"]
            #All reads in one SFF file must share the header's key/flow order.
            if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
                    or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
                raise ValueError("Records have inconsistent SFF flow data")
        except KeyError:
            raise ValueError("Missing SFF flow information for %s" % record.id)
        except AttributeError:
            raise ValueError("Header not written yet?")
        #Clipping
        #NOTE(review): the +1 on the non-zero left clips presumably converts
        #Python-style 0-based values back to SFF's 1-based positions (the
        #parser is assumed to do the reverse) - confirm against SffIterator.
        try:
            clip_qual_left = record.annotations["clip_qual_left"]
            if clip_qual_left < 0:
                raise ValueError("Negative SFF clip_qual_left value for %s" % record.id)
            if clip_qual_left:
                clip_qual_left += 1
            clip_qual_right = record.annotations["clip_qual_right"]
            if clip_qual_right < 0:
                raise ValueError("Negative SFF clip_qual_right value for %s" % record.id)
            clip_adapter_left = record.annotations["clip_adapter_left"]
            if clip_adapter_left < 0:
                raise ValueError("Negative SFF clip_adapter_left value for %s" % record.id)
            if clip_adapter_left:
                clip_adapter_left += 1
            clip_adapter_right = record.annotations["clip_adapter_right"]
            if clip_adapter_right < 0:
                raise ValueError("Negative SFF clip_adapter_right value for %s" % record.id)
        except KeyError:
            raise ValueError("Missing SFF clipping information for %s" % record.id)

        #Capture information for index
        if self._index is not None:
            offset = self.handle.tell()
            #Check the position of the final record (before sort by name)
            #Using a four-digit base 255 number, so the upper bound is
            #254*(1)+254*(255)+254*(255**2)+254*(255**3) = 4228250624
            #or equivalently it overflows at 255**4 = 4228250625
            if offset > 4228250624:
                import warnings
                warnings.warn("Read %s has file offset %i, which is too large "
                              "to store in the Roche SFF index structure. No "
                              "index block will be recorded." % (name, offset))
                #No point recording the offsets now
                self._index = None
            else:
                #Nothing has been written since tell() above, so this is
                #the same offset value.
                self._index.append((name, self.handle.tell()))

        #the read header format (fixed part):
        #read_header_length     H
        #name_length            H
        #seq_len                I
        #clip_qual_left         H
        #clip_qual_right        H
        #clip_adapter_left      H
        #clip_adapter_right     H
        #[rest of read header depends on the name length etc]
        #name
        #flow values
        #flow index
        #sequence
        #padding
        read_header_fmt = '>2HI4H%is' % name_len
        if struct.calcsize(read_header_fmt) % 8 == 0:
            padding = 0
        else:
            padding = 8 - (struct.calcsize(read_header_fmt) % 8)
        read_header_length = struct.calcsize(read_header_fmt) + padding
        assert read_header_length % 8 == 0
        data = struct.pack(read_header_fmt,
                           read_header_length,
                           name_len, seq_len,
                           clip_qual_left, clip_qual_right,
                           clip_adapter_left, clip_adapter_right,
                           name) + _null * padding
        assert len(data) == read_header_length
        #now the flowgram values, flowgram index, bases and qualities
        #NOTE - assuming flowgram_format==1, which means struct type H
        read_flow_fmt = ">%iH" % self._number_of_flows_per_read
        read_flow_size = struct.calcsize(read_flow_fmt)
        temp_fmt = ">%iB" % seq_len  # used for flow index and quals
        data += struct.pack(read_flow_fmt, *flow_values) \
            + struct.pack(temp_fmt, *flow_index) \
            + seq \
            + struct.pack(temp_fmt, *quals)
        #now any final padding...
        #(flow values take 2 bytes each; index, bases and quals 1 byte each,
        #hence read_flow_size + 3 * seq_len data bytes to round up to 8)
        padding = (read_flow_size + seq_len * 3) % 8
        if padding:
            padding = 8 - padding
        self.handle.write(data + _null * padding)

 

 

if __name__ == "__main__":
    #Quick self test, intended to be run from the source directory against
    #the sample files shipped with Biopython under Tests/Roche/
    print("Running quick self test")
    filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
    with open(filename, "rb") as handle:
        metadata = ReadRocheXmlManifest(handle)
    #The fast (Roche index block) and slow (scan every record) index
    #functions should agree with each other and with the record count.
    with open(filename, "rb") as handle:
        index1 = sorted(_sff_read_roche_index(handle))
    with open(filename, "rb") as handle:
        index2 = sorted(_sff_do_slow_index(handle))
    assert index1 == index2
    with open(filename, "rb") as handle:
        assert len(index1) == len(list(SffIterator(handle)))
    from io import BytesIO
    with open(filename, "rb") as handle:
        assert len(index1) == len(list(SffIterator(BytesIO(handle.read()))))

    if sys.platform != "win32" and sys.version_info[0] < 3:
        #Can be lazy and treat as binary...
        with open(filename, "r") as handle:
            assert len(index1) == len(list(SffIterator(handle)))
        with open(filename) as handle:
            index2 = sorted(_sff_read_roche_index(handle))
        assert index1 == index2
        with open(filename, "r") as handle:
            index2 = sorted(_sff_do_slow_index(handle))
        assert index1 == index2
        with open(filename, "r") as handle:
            assert len(index1) == len(list(SffIterator(handle)))
        with open(filename, "r") as handle:
            assert len(index1) == len(list(SffIterator(BytesIO(handle.read()))))

    with open(filename, "rb") as handle:
        sff = list(SffIterator(handle))

    #The same ten reads should be returned regardless of where (or if)
    #an index block appears within the file.
    for alt in ["E3MFGYR02_alt_index_at_end",
                "E3MFGYR02_alt_index_at_start",
                "E3MFGYR02_alt_index_in_middle",
                "E3MFGYR02_index_at_start",
                "E3MFGYR02_index_in_middle"]:
        with open("../../Tests/Roche/%s.sff" % alt, "rb") as handle:
            sff2 = list(SffIterator(handle))
        assert len(sff) == len(sff2)
        for old, new in zip(sff, sff2):
            assert old.id == new.id
            assert str(old.seq) == str(new.seq)

    with open(filename, "rb") as handle:
        sff_trim = list(SffIterator(handle, trim=True))

    with open(filename, "rb") as handle:
        print(ReadRocheXmlManifest(handle))

    #Cross check the untrimmed and trimmed reads against the reference
    #FASTA and QUAL files for the same ten reads.
    from Bio import SeqIO
    filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
    fasta_no_trim = list(SeqIO.parse(filename, "fasta"))
    filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
    qual_no_trim = list(SeqIO.parse(filename, "qual"))

    filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
    fasta_trim = list(SeqIO.parse(filename, "fasta"))
    filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
    qual_trim = list(SeqIO.parse(filename, "qual"))

    for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
                                   qual_no_trim, fasta_trim, qual_trim):
        print(s.id)
        assert s.id == f.id == q.id
        assert str(s.seq) == str(f.seq)
        assert s.letter_annotations[
            "phred_quality"] == q.letter_annotations["phred_quality"]
        assert s.id == sT.id == fT.id == qT.id
        assert str(sT.seq) == str(fT.seq)
        assert sT.letter_annotations[
            "phred_quality"] == qT.letter_annotations["phred_quality"]

    #Round trip: writing the parsed records back out should regenerate
    #the original file byte for byte, from a list or from an iterator.
    print("Writing with a list of SeqRecords...")
    handle = BytesIO()
    w = SffWriter(handle, xml=metadata)
    w.write_file(sff)  # list
    data = handle.getvalue()
    print("And again with an iterator...")
    handle = BytesIO()
    w = SffWriter(handle, xml=metadata)
    w.write_file(iter(sff))
    assert data == handle.getvalue()
    #Check 100% identical to the original:
    filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
    with open(filename, "rb") as handle:
        original = handle.read()
        assert len(data) == len(original)
        assert data == original
        del data

    print("-" * 50)
    filename = "../../Tests/Roche/greek.sff"
    with open(filename, "rb") as handle:
        for record in SffIterator(handle):
            print(record.id)
    with open(filename, "rb") as handle:
        index1 = sorted(_sff_read_roche_index(handle))
    with open(filename, "rb") as handle:
        index2 = sorted(_sff_do_slow_index(handle))
    assert index1 == index2
    #greek.sff has no XML manifest, so this should raise a ValueError:
    try:
        with open(filename, "rb") as handle:
            print(ReadRocheXmlManifest(handle))
        assert False, "Should fail!"
    except ValueError:
        pass

    #Re-using an exhausted handle should be detected as an error:
    with open(filename, "rb") as handle:
        for record in SffIterator(handle):
            pass
        try:
            for record in SffIterator(handle):
                print(record.id)
            assert False, "Should have failed"
        except ValueError as err:
            print("Checking what happens on re-reading a handle:")
            print(err)

 

    """ 

    #Ugly code to make test files... 

    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" 

    padding = len(index)%8 

    if padding: 

        padding = 8 - padding 

    index += chr(0)*padding 

    assert len(index)%8 == 0 

 

    #Ugly bit of code to make a fake index at start 

    records = list(SffIterator( 

        open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb"))) 

    out_handle = open( 

        "../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w") 

    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" 

    padding = len(index)%8 

    if padding: 

        padding = 8 - padding 

    index += chr(0)*padding 

    w = SffWriter(out_handle, index=False, xml=None) 

    #Fake the header... 

    w._number_of_reads = len(records) 

    w._index_start = 0 

    w._index_length = 0 

    w._key_sequence = records[0].annotations["flow_key"] 

    w._flow_chars = records[0].annotations["flow_chars"] 

    w._number_of_flows_per_read = len(w._flow_chars) 

    w.write_header() 

    w._index_start = out_handle.tell() 

    w._index_length = len(index) 

    out_handle.seek(0) 

    w.write_header() #this time with index info 

    w.handle.write(index) 

    for record in records: 

        w.write_record(record) 

    out_handle.close() 

    records2 = list(SffIterator( 

        open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb"))) 

    for old, new in zip(records, records2): 

        assert str(old.seq)==str(new.seq) 

    i = list(_sff_do_slow_index( 

        open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb"))) 

 

    #Ugly bit of code to make a fake index in middle 

    records = list(SffIterator( 

        open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb"))) 

    out_handle = open( 

        "../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w") 

    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" 

    padding = len(index)%8 

    if padding: 

        padding = 8 - padding 

    index += chr(0)*padding 

    w = SffWriter(out_handle, index=False, xml=None) 

    #Fake the header... 

    w._number_of_reads = len(records) 

    w._index_start = 0 

    w._index_length = 0 

    w._key_sequence = records[0].annotations["flow_key"] 

    w._flow_chars = records[0].annotations["flow_chars"] 

    w._number_of_flows_per_read = len(w._flow_chars) 

    w.write_header() 

    for record in records[:5]: 

        w.write_record(record) 

    w._index_start = out_handle.tell() 

    w._index_length = len(index) 

    w.handle.write(index) 

    for record in records[5:]: 

        w.write_record(record) 

    out_handle.seek(0) 

    w.write_header() #this time with index info 

    out_handle.close() 

    records2 = list(SffIterator( 

        open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb"))) 

    for old, new in zip(records, records2): 

        assert str(old.seq)==str(new.seq) 

    j = list(_sff_do_slow_index( 

        open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb"))) 

 

    #Ugly bit of code to make a fake index at end 

    records = list(SffIterator( 

        open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb"))) 

    with open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w") as out_handle: 

        w = SffWriter(out_handle, index=False, xml=None) 

        #Fake the header... 

        w._number_of_reads = len(records) 

        w._index_start = 0 

        w._index_length = 0 

        w._key_sequence = records[0].annotations["flow_key"] 

        w._flow_chars = records[0].annotations["flow_chars"] 

        w._number_of_flows_per_read = len(w._flow_chars) 

        w.write_header() 

        for record in records: 

            w.write_record(record) 

        w._index_start = out_handle.tell() 

        w._index_length = len(index) 

        out_handle.write(index) 

        out_handle.seek(0) 

        w.write_header() #this time with index info 

    records2 = list(SffIterator( 

        open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))) 

    for old, new in zip(records, records2): 

        assert str(old.seq)==str(new.seq) 

    try: 

        print(ReadRocheXmlManifest( 

            open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))) 

        assert False, "Should fail!" 

    except ValueError: 

        pass 

    k = list(_sff_do_slow_index( 

        open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))) 

    """ 

 

    print("Done")