Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

# Copyright 2008-2010 by Peter Cock.  All rights reserved. 

# 

# This code is part of the Biopython distribution and governed by its 

# license.  Please see the LICENSE file that should have been included 

# as part of this package. 

 

"""Bio.SeqIO support for the "ace" file format. 

 

You are expected to use this module via the Bio.SeqIO functions. 

See also the Bio.Sequencing.Ace module which offers more than just accessing 

the contig consensus sequences in an ACE file as SeqRecord objects. 

""" 

 

from __future__ import print_function 

 

from Bio.Seq import Seq 

from Bio.SeqRecord import SeqRecord 

from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped 

from Bio.Sequencing import Ace 

 

 

def AceIterator(handle): 

    """Returns SeqRecord objects from an ACE file. 

 

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that 

    by iterating over the file in a single pass, we are forced to ignore any 

    WA, CT, RT or WR footer tags. 

 

    Ace files include the base quality for each position, which are taken 

    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file 

    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's 

    letter_annotations dictionary under the "phred_quality" key. 

 

    >>> from Bio import SeqIO 

    >>> with open("Ace/consed_sample.ace", "rU") as handle: 

    ...     for record in SeqIO.parse(handle, "ace"): 

    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record))) 

    ...         print(max(record.letter_annotations["phred_quality"])) 

    Contig1 agccccgggc... 1475 

    90 

 

    However, ACE files do not include a base quality for any gaps in the 

    consensus sequence, and these are represented in Biopython with a quality 

    of zero. Using zero is perhaps misleading as there may be very strong 

    evidence to support the gap in the consensus. Previous versions of 

    Biopython therefore used None instead, but this complicated usage, and 

    prevented output of the gapped sequence as FASTQ format. 

 

    >>> from Bio import SeqIO 

    >>> with open("Ace/contig1.ace", "rU") as handle: 

    ...     for record in SeqIO.parse(handle, "ace"): 

    ...         print("%s ...%s..." % (record.id, record.seq[85:95])) 

    ...         print(record.letter_annotations["phred_quality"][85:95]) 

    ...         print(max(record.letter_annotations["phred_quality"])) 

    Contig1 ...AGAGG-ATGC... 

    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72] 

    90 

    Contig2 ...GAATTACTAT... 

    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68] 

    90 

 

    """ 

    for ace_contig in Ace.parse(handle): 

        #Convert the ACE contig record into a SeqRecord... 

        consensus_seq_str = ace_contig.sequence 

        #Assume its DNA unless there is a U in it, 

        if "U" in consensus_seq_str: 

            if "T" in consensus_seq_str: 

                #Very odd! Error? 

                alpha = generic_nucleotide 

            else: 

                alpha = generic_rna 

        else: 

            alpha = generic_dna 

 

        if "*" in consensus_seq_str: 

            #For consistency with most other file formats, map 

            #any * gaps into - gaps. 

            assert "-" not in consensus_seq_str 

            consensus_seq = Seq(consensus_seq_str.replace("*", "-"), 

                                Gapped(alpha, gap_char="-")) 

        else: 

            consensus_seq = Seq(consensus_seq_str, alpha) 

 

        #TODO? - Base segments (BS lines) which indicates which read 

        #phrap has chosen to be the consensus at a particular position. 

        #Perhaps as SeqFeature objects? 

 

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines) 

        #Perhaps as SeqFeature objects? 

 

        seq_record = SeqRecord(consensus_seq, 

                               id=ace_contig.name, 

                               name=ace_contig.name) 

 

        #Consensus base quality (BQ lines).  Note that any gaps (originally 

        #as * characters) in the consensus do not get a quality entry, so 

        #we assign a quality of None (zero would be missleading as there may 

        #be excelent support for having a gap here). 

        quals = [] 

        i = 0 

        for base in consensus_seq: 

            if base == "-": 

                quals.append(0) 

            else: 

                quals.append(ace_contig.quality[i]) 

                i += 1 

        assert i == len(ace_contig.quality) 

        seq_record.letter_annotations["phred_quality"] = quals 

 

        yield seq_record 

    #All done 

 

 

if __name__ == "__main__": 

    from Bio._utils import run_doctest 

    run_doctest()