Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

# Copyright 2010 by Thomas Schmitt. 

# All rights reserved. 

# 

# This module is for reading and writing SeqXML format files as 

# SeqRecord objects, and is expected to be used via the Bio.SeqIO API. 

"""Bio.SeqIO support for the "seqxml" file format, SeqXML. 

 

You are expected to use this module via the Bio.SeqIO functions. 

 

SeqXML is a lightweight XML format which is supposed be an alternative for 

FASTA files. For more Information see http://www.seqXML.org and Schmitt et al 

(2011), http://dx.doi.org/10.1093/bib/bbr025 

""" 

 

from __future__ import print_function 

 

from xml.sax.saxutils import XMLGenerator 

from xml.sax.xmlreader import AttributesImpl 

from xml.dom import pulldom 

from xml.sax import SAXParseException 

 

from Bio._py3k import range 

from Bio._py3k import basestring 

 

from Bio import Alphabet 

from Bio.Seq import Seq 

from Bio.Seq import UnknownSeq 

from Bio.SeqRecord import SeqRecord 

from .Interfaces import SequentialSequenceWriter 

 

 

class XMLRecordIterator: 

    """Base class for building iterators for record style XML formats. 

 

    It is assumed that all information for one record can be found within a 

    record element or above. Two types of methods are called when the start 

    tag of an element is reached. To receive only the attributes of an 

    element before its end tag is reached implement _attr_TAGNAME. 

    To get an element and its children as a DOM tree implement _elem_TAGNAME. 

    Everything that is part of the DOM tree will not trigger any further 

    method calls. 

    """ 

 

    def __init__(self, handle, recordTag, namespace=None): 

        """Creating the object and initializing the XML parser.""" 

 

        self._recordTag = recordTag 

        self._namespace = namespace 

        self._events = pulldom.parse(handle) 

 

    def __iter__(self): 

        """Iterate over the records in the XML file. 

        Returns the last parsed record.""" 

 

        record = None 

        try: 

            for event, node in self._events: 

 

                if event == "START_ELEMENT" and node.namespaceURI == self._namespace: 

 

                    if node.localName == self._recordTag: 

                        #create an empty SeqRecord 

                        record = SeqRecord('', id='') 

 

                    #call matching methods with attributes only 

                    if hasattr(self, "_attr_" + node.localName): 

                        getattr(self, "_attr_" + node.localName)( 

                            self._attributes(node), record) 

 

                    #call matching methods with DOM tree 

                    if hasattr(self, "_elem_" + node.localName): 

                        #read the element and all nested elements into a DOM tree 

                        self._events.expandNode(node) 

                        node.normalize() 

 

                        getattr(self, "_elem_" + node.localName)(node, record) 

 

                elif event == "END_ELEMENT" and node.namespaceURI == self._namespace and node.localName == self._recordTag: 

                    yield record 

 

        except SAXParseException as e: 

 

            if e.getLineNumber() == 1 and e.getColumnNumber() == 0: 

                #empty file 

                pass 

            else: 

                import os 

                if e.getLineNumber() == 1 and e.getColumnNumber() == 1 \ 

                        and os.name == "java": 

                    #empty file, see http://bugs.jython.org/issue1774 

                    pass 

                else: 

                    raise 

 

    def _attributes(self, node): 

        """Return the attributes of a DOM node as dictionary.""" 

 

        return dict((node.attributes.item(i).name, node.attributes.item(i).value) 

                    for i in range(node.attributes.length)) 

 

 

class SeqXmlIterator(XMLRecordIterator): 

    """Breaks seqXML file into SeqRecords. 

 

    Assumes valid seqXML please validate beforehand.""" 

 

    def __init__(self, handle): 

        """Create the object.""" 

        XMLRecordIterator.__init__(self, handle, "entry") 

 

        self._source = None 

        self._source_version = None 

        self._version = None 

        self._speciesName = None 

        self._ncbiTaxId = None 

 

    def _attr_seqXML(self, attr_dict, record): 

        """Parse the document metadata.""" 

 

        if "source" in attr_dict: 

            self._source = attr_dict["source"] 

        if "sourceVersion" in attr_dict: 

            self._source_version = attr_dict["sourceVersion"] 

        if "version" in attr_dict: 

            self._version = attr_dict["seqXMLversion"] 

        if "ncbiTaxID" in attr_dict: 

            self._ncbiTaxId = attr_dict["ncbiTaxID"] 

        if "speciesName" in attr_dict: 

            self._speciesName = attr_dict["speciesName"] 

 

    def _attr_property(self, attr_dict, record): 

        """Parse key value pair properties and store them as annotations.""" 

 

        if "name" not in attr_dict: 

            raise ValueError("Malformed property element.") 

 

        value = attr_dict.get("value", None) 

 

        if attr_dict["name"] not in record.annotations: 

            record.annotations[attr_dict["name"]] = value 

        elif isinstance(record.annotations[attr_dict["name"]], list): 

            record.annotations[attr_dict["name"]].append(value) 

        else: 

            record.annotations[attr_dict["name"]] = [ 

                record.annotations[attr_dict["name"]], value] 

 

    def _attr_species(self, attr_dict, record): 

        """Parse the species information.""" 

 

        if "name" not in attr_dict or "ncbiTaxID" not in attr_dict: 

            raise ValueError("Malformed species element!") 

 

        #the keywords for the species annotation are taken from SwissIO 

        record.annotations["organism"] = attr_dict["name"] 

        #TODO - Should have been a list to match SwissProt parser: 

        record.annotations["ncbi_taxid"] = attr_dict["ncbiTaxID"] 

 

    def _attr_entry(self, attr_dict, record): 

        """New entry set id and the optional entry source.""" 

 

        if "id" not in attr_dict: 

            raise ValueError("Malformed entry! Identifier is missing.") 

 

        record.id = attr_dict["id"] 

        if "source" in attr_dict: 

            record.annotations["source"] = attr_dict["source"] 

        elif self._source is not None: 

            record.annotations["source"] = self._source 

 

        #initialize entry with global species definition 

        #the keywords for the species annotation are taken from SwissIO 

        if self._ncbiTaxId is not None: 

            record.annotations["ncbi_taxid"] = self._ncbiTaxId 

        if self._speciesName is not None: 

            record.annotations["organism"] = self._speciesName 

 

    def _elem_DNAseq(self, node, record): 

        """Parse DNA sequence.""" 

 

        if not (node.hasChildNodes() and len(node.firstChild.data) > 0): 

            raise ValueError("Sequence length should be greater than 0.") 

 

        record.seq = Seq(node.firstChild.data, Alphabet.generic_dna) 

 

    def _elem_RNAseq(self, node, record): 

        """Parse RNA sequence.""" 

 

        if not (node.hasChildNodes() and len(node.firstChild.data) > 0): 

            raise ValueError("Sequence length should be greater than 0.") 

 

        record.seq = Seq(node.firstChild.data, Alphabet.generic_rna) 

 

    def _elem_AAseq(self, node, record): 

        """Parse protein sequence.""" 

 

        if not (node.hasChildNodes() and len(node.firstChild.data) > 0): 

            raise ValueError("Sequence length should be greater than 0.") 

 

        record.seq = Seq(node.firstChild.data, Alphabet.generic_protein) 

 

    def _elem_description(self, node, record): 

        """Parse the description.""" 

 

        if node.hasChildNodes() and len(node.firstChild.data) > 0: 

            record.description = node.firstChild.data 

 

    def _attr_DBRef(self, attr_dict, record): 

        """Parse a database cross reference""" 

 

        if "source" not in attr_dict or "id" not in attr_dict: 

            raise ValueError("Invalid DB cross reference.") 

 

        if "%s:%s" % (attr_dict["source"], attr_dict["id"]) not in record.dbxrefs: 

            record.dbxrefs.append( 

                "%s:%s" % (attr_dict["source"], attr_dict["id"])) 

 

 

class SeqXmlWriter(SequentialSequenceWriter): 

    """Writes SeqRecords into seqXML file. 

 

    SeqXML requires the sequence alphabet be explicitly RNA, DNA or protein, 

    i.e. an instance or subclass of Bio.Alphapet.RNAAlphabet, 

    Bio.Alphapet.DNAAlphabet or Bio.Alphapet.ProteinAlphabet. 

    """ 

 

    def __init__(self, handle, source=None, source_version=None, 

                 species=None, ncbiTaxId=None): 

        """Create Object and start the xml generator.""" 

 

        SequentialSequenceWriter.__init__(self, handle) 

 

        self.xml_generator = XMLGenerator(handle, "utf-8") 

        self.xml_generator.startDocument() 

        self.source = source 

        self.source_version = source_version 

        self.species = species 

        self.ncbiTaxId = ncbiTaxId 

 

    def write_header(self): 

        """Write root node with document metadata.""" 

        SequentialSequenceWriter.write_header(self) 

 

        attrs = {"xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", 

                 "xsi:noNamespaceSchemaLocation": "http://www.seqxml.org/0.4/seqxml.xsd", 

                 "seqXMLversion": "0.4"} 

 

        if self.source is not None: 

            attrs["source"] = self.source 

        if self.source_version is not None: 

            attrs["sourceVersion"] = self.source_version 

        if self.species is not None: 

            if not isinstance(self.species, basestring): 

                raise TypeError("species should be of type string") 

            attrs["speciesName"] = self.species 

        if self.ncbiTaxId is not None: 

            if not isinstance(self.ncbiTaxId, (basestring, int)): 

                raise TypeError("ncbiTaxID should be of type string or int") 

            attrs["ncbiTaxID"] = self.ncbiTaxId 

 

        self.xml_generator.startElement("seqXML", AttributesImpl(attrs)) 

 

    def write_record(self, record): 

        """Write one record.""" 

 

        if not record.id or record.id == "<unknown id>": 

            raise ValueError("SeqXML requires identifier") 

 

        if not isinstance(record.id, basestring): 

            raise TypeError("Identifier should be of type string") 

 

        attrb = {"id": record.id} 

 

        if "source" in record.annotations and self.source != record.annotations["source"]: 

            if not isinstance(record.annotations["source"], basestring): 

                raise TypeError("source should be of type string") 

            attrb["source"] = record.annotations["source"] 

 

        self.xml_generator.startElement("entry", AttributesImpl(attrb)) 

        self._write_species(record) 

        self._write_description(record) 

        self._write_seq(record) 

        self._write_dbxrefs(record) 

        self._write_properties(record) 

        self.xml_generator.endElement("entry") 

 

    def write_footer(self): 

        """Close the root node and finish the XML document.""" 

 

        SequentialSequenceWriter.write_footer(self) 

 

        self.xml_generator.endElement("seqXML") 

        self.xml_generator.endDocument() 

 

    def _write_species(self, record): 

        """Write the species if given.""" 

 

        local_ncbi_taxid = None 

        if "ncbi_taxid" in record.annotations: 

            local_ncbi_taxid = record.annotations["ncbi_taxid"] 

            if isinstance(local_ncbi_taxid, list): 

                #SwissProt parser uses a list (which could cope with chimeras) 

                if len(local_ncbi_taxid) == 1: 

                    local_ncbi_taxid = local_ncbi_taxid[0] 

                elif len(local_ncbi_taxid) == 0: 

                    local_ncbi_taxid = None 

                else: 

                    ValueError('Multiple entries for record.annotations["ncbi_taxid"], %r' 

                                     % local_ncbi_taxid) 

        if "organism" in record.annotations and local_ncbi_taxid: 

            local_org = record.annotations["organism"] 

 

            if not isinstance(local_org, basestring): 

                raise TypeError("organism should be of type string") 

 

            if not isinstance(local_ncbi_taxid, (basestring, int)): 

                raise TypeError("ncbiTaxID should be of type string or int") 

 

            #The local species definition is only written if it differs from the global species definition 

            if local_org != self.species or local_ncbi_taxid != self.ncbiTaxId: 

 

                attr = {"name": local_org, 

                        "ncbiTaxID": local_ncbi_taxid} 

                self.xml_generator.startElement( 

                    "species", AttributesImpl(attr)) 

                self.xml_generator.endElement("species") 

 

    def _write_description(self, record): 

        """Write the description if given.""" 

 

        if record.description: 

 

            if not isinstance(record.description, basestring): 

                raise TypeError("Description should be of type string") 

 

            description = record.description 

            if description == "<unknown description>": 

                description = "" 

 

            if len(record.description) > 0: 

                self.xml_generator.startElement( 

                    "description", AttributesImpl({})) 

                self.xml_generator.characters(description) 

                self.xml_generator.endElement("description") 

 

    def _write_seq(self, record): 

        """Write the sequence. 

 

        Note that SeqXML requires a DNA, RNA or protein alphabet. 

        """ 

 

        if isinstance(record.seq, UnknownSeq): 

            raise TypeError( 

                "Sequence type is UnknownSeq but SeqXML requires sequence") 

 

        seq = str(record.seq) 

 

        if not len(seq) > 0: 

            raise ValueError("The sequence length should be greater than 0") 

 

        #Get the base alphabet (underneath any Gapped or StopCodon encoding) 

        alpha = Alphabet._get_base_alphabet(record.seq.alphabet) 

        if isinstance(alpha, Alphabet.RNAAlphabet): 

            seqElem = "RNAseq" 

        elif isinstance(alpha, Alphabet.DNAAlphabet): 

            seqElem = "DNAseq" 

        elif isinstance(alpha, Alphabet.ProteinAlphabet): 

            seqElem = "AAseq" 

        else: 

            raise ValueError("Need a DNA, RNA or Protein alphabet") 

 

        self.xml_generator.startElement(seqElem, AttributesImpl({})) 

        self.xml_generator.characters(seq) 

        self.xml_generator.endElement(seqElem) 

 

    def _write_dbxrefs(self, record): 

        """Write all database cross references.""" 

        if record.dbxrefs is not None: 

 

            for dbxref in record.dbxrefs: 

 

                if not isinstance(dbxref, basestring): 

                    raise TypeError("dbxrefs should be of type list of string") 

                if dbxref.find(':') < 1: 

                    raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]") 

 

                dbsource, dbid = dbxref.split(':', 1) 

 

                attr = {"source": dbsource, "id": dbid} 

                self.xml_generator.startElement("DBRef", AttributesImpl(attr)) 

                self.xml_generator.endElement("DBRef") 

 

    def _write_properties(self, record): 

        """Write all annotations that are key value pairs with values of a primitive type or list of primitive types.""" 

 

        for key, value in record.annotations.items(): 

 

            if key not in ("organism", "ncbi_taxid", "source"): 

 

                if value is None: 

 

                    attr = {"name": key} 

                    self.xml_generator.startElement( 

                        "property", AttributesImpl(attr)) 

                    self.xml_generator.endElement("property") 

 

                elif isinstance(value, list): 

 

                    for v in value: 

                        if isinstance(value, (int, float, basestring)): 

                            attr = {"name": key, "value": v} 

                            self.xml_generator.startElement( 

                                "property", AttributesImpl(attr)) 

                            self.xml_generator.endElement("property") 

 

                elif isinstance(value, (int, float, basestring)): 

 

                    attr = {"name": key, "value": str(value)} 

                    self.xml_generator.startElement( 

                        "property", AttributesImpl(attr)) 

                    self.xml_generator.endElement("property") 

 

if __name__ == "__main__": 

    print("Running quick self test") 

 

    from Bio import SeqIO 

    import sys 

 

    with open("Tests/SeqXML/protein_example.xml", "r") as fileHandle: 

        records = list(SeqIO.parse(fileHandle, "seqxml")) 

 

    from Bio._py3k import StringIO 

    stringHandle = StringIO() 

 

    SeqIO.write(records, stringHandle, "seqxml") 

    SeqIO.write(records, sys.stdout, "seqxml") 

    print("") 

 

    stringHandle.seek(0) 

    records = list(SeqIO.parse(stringHandle, "seqxml")) 

 

    SeqIO.write(records, sys.stdout, "seqxml") 

    print("")