Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/name.py: 96%

104 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-12-25 12:03 +1100

1from __future__ import annotations 

2 

3import re 

4import typing 

5 

6from dataclasses import dataclass 

7 

8from .species import Species 

9 

10 

# a run of one or more digits, used to locate release / build numbers
_release = re.compile(r"\d+")
# components of a db name are separated by underscores
_name_delim = re.compile("_")


def get_version_from_name(name):
    """returns the release and build identifiers from an ensembl db_name

    Returns (None, None) when name contains no digits. Otherwise returns
    a tuple of the first run of digits in name (the release, as a string)
    and a list of the non-empty, underscore delimited components that
    follow that run (the build, possibly an empty list).
    """
    match = _release.search(name)
    if match is None:
        return None, None

    # first number run is release, followed by build
    # note, for the ensemblgenomes naming system, the second digit run is the
    # standard Ensembl release and the first is for the specified genome
    release = match.group()
    remainder = name[match.end() :]
    # splitting on "_" leaves empty strings for leading / doubled
    # delimiters, filter(None, ...) drops them
    build = list(filter(None, _name_delim.split(remainder)))
    return release, build

30 

31 

def get_dbtype_from_name(name):
    """returns the data base type from the name

    Only the text before the first run of digits is examined. It is split
    on underscores and, if the first component is 'ensembl', the second
    component is returned, otherwise the last component is returned.
    """
    before_release = _release.split(name)[0]
    parts = [part for part in _name_delim.split(before_release) if part]
    if parts[0] == "ensembl":
        return parts[1]
    return parts[-1]

37 

38 

def get_db_prefix(name):
    """returns the db prefix, typically an organism or `ensembl'

    Only the text before the first run of digits is examined. It is split
    on underscores; if the first component is 'ensembl' that is the
    prefix, otherwise the prefix is every component except the last,
    joined by underscores.

    Raises
    ------
    ValueError
        if the name does not start with 'ensembl' and has fewer than three
        components before the first run of digits. This includes names
        with nothing before the digits (e.g. an empty string), which
        previously failed with an IndexError.
    """
    before_release = _release.split(name)[0]
    parts = [part for part in _name_delim.split(before_release) if part]
    # guard on parts being non-empty so degenerate names reach the
    # ValueError below instead of failing on parts[0]
    if parts and parts[0] == "ensembl":
        return "ensembl"
    if len(parts) > 2:
        return "_".join(parts[:-1])
    raise ValueError(f"Unknown name structure: {'_'.join(parts)}")

50 

51 

class EnsemblDbName:
    """an Ensembl db name plus the attributes inferred from it, such as
    prefix, type, release, build and species

    Comparison, equality and hashing are all based on the name attribute,
    and the other operand may be either another instance or a plain value.
    """

    def __init__(self, db_name):
        """db_name: an Ensembl database name"""
        self.name = db_name
        self.type = get_dbtype_from_name(db_name)
        self.prefix = get_db_prefix(db_name)

        release, build = get_version_from_name(db_name)
        self.release = release
        # defaults, overridden below depending on what follows the release
        self.general_release = release
        self.build = None

        num_components = len(build) if build else 0
        if num_components == 1:
            # a single trailing component is the general release for
            # compara databases and the build for every other type
            if self.type == "compara":
                self.general_release = build[0]
            else:
                self.build = build[0]
        elif num_components > 1:
            # general release comes first, then the build
            self.general_release, self.build = build[0], build[1]

        self.species = Species.get_species_name(self.prefix)

    def __repr__(self):
        fields = (
            f"prefix='{self.prefix}'; type='{self.type}'; release='{self.release}'"
        )
        # build is only displayed when it has been set
        if self.build is not None:
            fields += f"; build='{self.build}'"
        return f"db({fields})"

    def __str__(self):
        return self.name

    def __lt__(self, other):
        # compare by name when given another instance, else use other as is
        rhs = other.name if isinstance(other, type(self)) else other
        return self.name < rhs

    def __eq__(self, other):
        rhs = other.name if isinstance(other, type(self)) else other
        return self.name == rhs

    def __ne__(self, other):
        rhs = other.name if isinstance(other, type(self)) else other
        return self.name != rhs

    def __hash__(self):
        # consistent with __eq__, which also relies only on name
        return hash(self.name)

102 

103 

@dataclass
class EmfName:
    """stores information from EMF SEQ records

    start and end are converted to int on construction, with start
    reduced by one. coord_length is stored as given.
    """

    species: str
    coord_name: str
    start: int
    end: int
    strand: str
    coord_length: str

    def __post_init__(self):
        # cast the coordinates to int and shift start into python coords
        self.end = int(self.end)
        self.start = int(self.start) - 1

    def __str__(self):
        # colon delimited, coord_length is not included
        values = (self.species, self.coord_name, self.start, self.end, self.strand)
        return ":".join(map(str, values))

    def __hash__(self):
        # hash of the string form, so coord_length does not contribute
        return hash(str(self))

    def to_dict(self) -> dict:
        """returns all attributes except coord_length as a dict"""
        return {
            "species": self.species,
            "coord_name": self.coord_name,
            "start": self.start,
            "end": self.end,
            "strand": self.strand,
        }

131 

132 

133@dataclass 

134class MafName: 

135 """stores source information from Maf records""" 

136 

137 species: str 

138 coord_name: str 

139 start: int 

140 end: int 

141 strand: str 

142 coord_length: typing.Optional[str | int] 

143 

144 def __post_init__(self): 

145 # adjust the lengths to be ints and put into python coord 

146 self.start = int(self.start) - 1 

147 self.end = int(self.end) 

148 self.coord_length = int(self.coord_length) if self.coord_length else None 

149 

150 def __str__(self): 

151 attrs = "species", "coord_name", "start", "end", "strand" 

152 n = [str(getattr(self, attr)) for attr in attrs] 

153 return ":".join(n) 

154 

155 def __hash__(self): 

156 return hash(str(self)) 

157 

158 def to_dict(self) -> dict: 

159 attrs = "species", "coord_name", "start", "end", "strand" 

160 return {attr: getattr(self, attr) for attr in attrs}