Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

# Copyright 2000-2001 by Andrew Dalke. 

# Revisions copyright 2008 by Peter Cock. 

# All rights reserved. 

# This code is part of the Biopython distribution and governed by its 

# license.  Please see the LICENSE file that should have been included 

# as part of this package. 

 

"""Standard nucleotide and protein alphabets defined by IUPAC.""" 

 

from Bio import Alphabet 

from Bio.Data import IUPACData 

 

##################### Protein 

 

# From the IUPAC definition at: 

#   http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html#AA21 

 

# Import-time sanity check: the extended protein letters must be upper case.
assert IUPACData.extended_protein_letters == IUPACData.extended_protein_letters.upper()


class ExtendedIUPACProtein(Alphabet.ProteinAlphabet):
    """Extended uppercase IUPAC protein single letter alphabet including X etc.

    In addition to the standard 20 single letter protein codes, this includes:

    B = "Asx";  Aspartic acid (D) or Asparagine (N)
    X = "Xxx";  Unknown or 'other' amino acid
    Z = "Glx";  Glutamic acid (E) or Glutamine (Q)
    J = "Xle";  Leucine (L) or Isoleucine (I), used in mass-spec (NMR)
    U = "Sec";  Selenocysteine
    O = "Pyl";  Pyrrolysine

    This alphabet is not intended to be used with X for Selenocysteine
    (an ad-hoc standard prior to the IUPAC adoption of U instead).
    """

    # The permitted letters are taken from the shared IUPACData tables.
    letters = IUPACData.extended_protein_letters


# Module-level instance of the extended protein alphabet.
extended_protein = ExtendedIUPACProtein()

 

# Import-time sanity check: the standard protein letters must be upper case.
assert IUPACData.protein_letters == IUPACData.protein_letters.upper()


class IUPACProtein(ExtendedIUPACProtein):
    """IUPAC single letter protein alphabet, uppercase, 20 standard amino acids only."""

    # Overrides the letters inherited from ExtendedIUPACProtein.
    letters = IUPACData.protein_letters


# Module-level instance of the standard protein alphabet.
protein = IUPACProtein()

 

##################### DNA 

 

 

# IUPACAmbiguousDNA and IUPACUnambiguousDNA are the IUPAC definitions, from:
#   http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html
class IUPACAmbiguousDNA(Alphabet.DNAAlphabet):
    """IUPAC DNA alphabet with ambiguity codes (uppercase)."""

    # The permitted letters are taken from the shared IUPACData tables.
    letters = IUPACData.ambiguous_dna_letters


# Module-level instance of the ambiguous DNA alphabet.
ambiguous_dna = IUPACAmbiguousDNA()

 

 

class IUPACUnambiguousDNA(IUPACAmbiguousDNA):
    """IUPAC DNA alphabet without ambiguity codes: uppercase G, A, T and C only."""

    # Overrides the letters inherited from IUPACAmbiguousDNA.
    letters = IUPACData.unambiguous_dna_letters


# Module-level instance of the unambiguous DNA alphabet.
unambiguous_dna = IUPACUnambiguousDNA()

 

 

# Also from the URL, but not part of the standard
class ExtendedIUPACDNA(Alphabet.DNAAlphabet):
    """DNA alphabet extending the standard GATC letter codes.

    The additional letters are:

    B = 5-bromouridine
    D = 5,6-dihydrouridine
    S = thiouridine
    W = wyosine
    """

    # The permitted letters are taken from the shared IUPACData tables.
    letters = IUPACData.extended_dna_letters


# Module-level instance of the extended DNA alphabet.
extended_dna = ExtendedIUPACDNA()

 

##################### RNA 

 

 

class IUPACAmbiguousRNA(Alphabet.RNAAlphabet):
    """IUPAC RNA alphabet with ambiguity codes (uppercase)."""

    # The permitted letters are taken from the shared IUPACData tables.
    letters = IUPACData.ambiguous_rna_letters


# Module-level instance of the ambiguous RNA alphabet.
ambiguous_rna = IUPACAmbiguousRNA()

 

 

class IUPACUnambiguousRNA(IUPACAmbiguousRNA):
    """IUPAC RNA alphabet without ambiguity codes: uppercase G, A, U and C only."""

    # Overrides the letters inherited from IUPACAmbiguousRNA.
    letters = IUPACData.unambiguous_rna_letters


# Module-level instance of the unambiguous RNA alphabet.
unambiguous_rna = IUPACUnambiguousRNA()

 

# Open question: are there extended forms of the RNA alphabet?  Disabled sketch:

#class ExtendedIUPACRNA(Alphabet.RNAAlphabet): 

#    letters = extended_rna_letters 

#    #   B == 5-bromouridine 

#    #   D == 5,6-dihydrouridine 

#    #   S == thiouridine 

#    #   W == wyosine