Coverage for test_emapper.py: 99%

70 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1 

import sys
import os
import tarfile
from io import StringIO, BytesIO
import unittest

# Put the parent of this file's directory at the front of the import path.
# NOTE(review): presumably so the local ``treeprofiler`` checkout is the one
# imported below -- confirm against the repository layout.
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + '/..'))

#from collections import namedtuple
from tempfile import NamedTemporaryFile, TemporaryDirectory

from ete4 import Tree
from treeprofiler import tree_annotate
from treeprofiler.src import utils

15 

class TestTreeAnnotation(unittest.TestCase):
    """Tree annotation tests driven by eggNOG-mapper style input tables.

    Every test writes its input table(s) to temporary files, calls
    ``tree_annotate.run_tree_annotate`` on a small Newick tree and compares
    the annotated result with an expected NHX string.
    """

    # The compared NHX strings are very long; never truncate failure diffs.
    maxDiff = None

    # Tree shared by the two domain-architecture tests (pfam / smart).
    _DOMAIN_NEWICK = (
        "(1000565.METUNv1_03972:1,(1007099.SAMN05216287:1,"
        "(1121400.SAMN02746065_101305:1,1009370.ALO_07448:1)Internal_1:0.5)"
        "Internal_2:0.5)Root;"
    )

    # Comment header (version + command line) shared by the emapper fixtures;
    # each fixture prepends its own date line.
    _EMAPPER_RUN_HEADER = (
        '## emapper-2.1.9\n'
        '## /data/shared/home/emapper/miniconda3/envs/eggnog-mapper-2.1/bin/emapper.py'
        ' --cpu 20 --mp_start_method forkserver --data_dir /dev/shm/ -o out'
        ' --output_dir /emapper_web_jobs/emapper_jobs/user_data/MM_8bdu7zy0'
        ' --temp_dir /emapper_web_jobs/emapper_jobs/user_data/MM_8bdu7zy0'
        ' --override -m diamond --dmnd_ignore_warnings --dmnd_algo ctg'
        ' -i /emapper_web_jobs/emapper_jobs/user_data/MM_8bdu7zy0/queries.fasta'
        ' --evalue 0.001 --score 60 --pident 40 --query_cover 20'
        ' --subject_cover 20 --itype proteins --tax_scope auto'
        ' --target_orthologs all --go_evidence non-electronic'
        ' --pfam_realign denovo --num_servers 2 --report_orthologs'
        ' --decorate_gff yes --excel\n'
        '##\n'
    )

    @staticmethod
    def _alignment_fasta():
        """Return the four-sequence FASTA alignment used by the domain tests.

        Rows are emitted one per line, each terminated by a newline, with the
        ``>`` header line preceding the rows of every record.
        """
        gap = "-" * 60  # a full-width row made only of gap characters
        records = (
            (">1000565.METUNv1_03972", [gap] + [
                "----------MNTVTTTHVP--LSSLKT-------------R-------R--GTS-----",
                "QA---------D----------GEGS----VQVHQDPT--LRIGT--AKVFAVYGKGGIG",
                "KSTTSSNLSVAFSK----L--GKRVLQIGCDPKHDSTFTLTKS-----------------",
                "--------------------LVPTVID---ILETVD------F----------------H",
                "S------EELRP-----EDFVFPG--YNG------------VMCVEAGG-PPAGTGCGGY",
                "VVGQTVKLLKEHHLL--D---E--------T----DVVIFDVLGDVVCGGFAAPLQ--HA",
                "DRALVVTANDFDSIFAMNRIVAAIQAK--SKNY---KVRLGGVIANR-----SN--AT--",
                "-----------DQIDRFNERVGLKTMAQFPDLDV-IR--RSRL-KKATLFEM------DP",
                "TV----E-----VEAVQHEYLRLAASLWAG----A---DPL------ECAP---MK----",
                "DRDIFDLLGFD-------------------------------------------------",
            ] + [gap] * 11 + ["-------------------"]),
            # NOTE: this record name carries a "_3993" suffix that the tree
            # leaf "1007099.SAMN05216287" lacks; the expected trees show that
            # leaf without a dom_arq property.
            (">1007099.SAMN05216287_3993", [gap] * 2 + [
                "-------------------------------------------MA--LRQCAIYGKGGIG",
                "KSTTTQNLVSALAE----A--GQKVMIVGCDPKADSTRLILHA-----------------",
                "-------------------KAQNSIME---MAAEAG-----------------------S",
                "V------EDLEL-----EDVLKVG--YRD------------IKCVESGG-PEPGVGCAGR",
                "GVITAINFLEEEGAYE-E---D--------L----DFVFYDVLGDVVCGGFAMPIRENKA",
                "QEIYIVCSGEMMAMYAANNIAKGIVKY--ANSG---SVRLAGLICNS--RNTAR--ED--",
                "-----------ELIMELARQLGTQMIHFVPRDNV-VQ--RAEI-RRMTVVEY--------",
                "DP----T------AKQADEYRQLANKIVNN--R-----NFV------IPTP---IT----",
                "MDELESLLMEFGI-----LD-----E------E---------DESII-------------",
                "-------------------------------GKA------------AHEEAAS-A-----",
            ] + [gap] * 10 + ["-------------------"]),
            (">1121400.SAMN02746065_101305", [gap] * 2 + [
                "------------------------------------------------MKIAIYGKGGIG",
                "KSTISANLSAALAK----A--GKKVLQIGCDPKHDSTRLLLGG-----------------",
                "-------------------KRIMTALD---YMKNTP-----------------------V",
                "G-------LQRL-----DRVLHVG--YKG------------IVCAEAGG-PEPGVGCAGR",
                "GILSTFALFERLGLD--M---N-T------F----DVVVYDVLGDVVCGGFAVPLRQGFA",
                "DTVYVVTSEEFMSIYAANNILKGVKNF--DQGG----HRLAGLILNS--RGTHE--NR--",
                "-----------HPVKRFAQNVKLPVKQTVPRSEL-FR--KAEM-MEKTVVEA--------",
                "FP----D------STEARAFHDLARDVLEN----H---TFY------PARF---LN----",
                "EDVLEQLILQDSP-----PQ-----A------Q---------TDAEN-------------",
                "---------------RV------------PSKAP------------GINEKSP-EKKFKV",
                "KSS-------------------------------------DKKSVFLSKSLLTREPLHGC",
                "AFSGALATTTQIKDTVTVAHGPRSCTNIACQAILS-----------------AGFRLFTR",
                "KKIL-----LE-------------------------------------------------",
                "-------NQIAPAVISSDMDESVVIYGGKDNLVKTLEQAME-Q--------NPKAVFLVT",
                "TCPSGVIGDDPVAAIHEIRQKYPQIPVIAVTSDGNLRGD-YMQGVLNACMEGAGALMDK-",
                "TVTPKSHCVNILAEKNIAFNAESNFNTIADILKEMNIDI-NCRFVRNTSVEQLKGFLKAP",
                "LNLPAYTDYFGRLMADFIDERLGIPTAKQPFPVGFSESVAWVREIADFFHES-M-AGER-",
                "--VIDTHRRHYETMIKTYGHTLKGRRLMILTYMHNVDWIVEAAFDLG----------MEV",
                "IKVCILNFSQD----------------NLFIT-----RYP--ERFE-VETNYDPAKRDKD",
                "LERLKP-DLLLGNY--TPKNLPYPLHVDIIPMCPDVGFYGGLAFAHRWATLIKAPVTEGW",
                "KNDAL--------------",
            ]),
            (">1009370.ALO_07448", [gap] * 2 + [
                "-----------------------------------------MAKK--IKQIAIYGKGGIG",
                "KSTTTSNISAALAV----A--GYKVMQVGCDPKSDSTNTLRGG-----------------",
                "-------------------TYIPTVLD---TLQDRS------------------------",
                "S--------VKL-----SEIVFEG--FHG------------VYCVEAGG-PAPGVGCAGR",
                "GIISAVQTLKNLKVY--D---D--------L--DLDIVIYDVLGDVVCGGFAVPIREGIA",
                "EHVFTVSSADFMAIYAANNLFKGIKKY--SNSR---GALLGGVIANS--ISAPY--AK--",
                "-----------QIVDDFASRTKTQVVGYVPRSVT-VT--QSEL-QGKTTIEA--------",
                "FP----D------SPQAQVYKQLAAKIAAH----E---VSA------TPSP---LE----",
                "IEELRSWAAQWAD-----NL----VA------L---------ETGEV-------------",
                "-------------------------------RSA------------AQSI----------",
            ] + [gap] * 10 + ["-------------------"]),
        )
        return "".join(
            line + "\n"
            for header, rows in records
            for line in [header] + rows
        )

    def _annotate_domains(self, table_text, table_option):
        """Annotate the shared domain tree with one domain table.

        Args:
            table_text: Full text of the domain table to write to disk.
            table_option: Name of the ``run_tree_annotate`` keyword that
                receives the table path (``"emapper_pfam"`` or
                ``"emapper_smart"``).

        Returns:
            The annotated tree (first element of the tuple returned by
            ``run_tree_annotate``).
        """
        test_tree = utils.ete4_parse(self._DOMAIN_NEWICK)
        # Both tests historically used the same '.out.emapper.pfam' suffix
        # for the table file, so it is kept for the SMART table as well.
        with NamedTemporaryFile(suffix='.out.emapper.pfam') as f_table, \
                NamedTemporaryFile(suffix='.fasta') as f_msa:
            f_table.write(table_text.encode())
            f_table.flush()
            f_msa.write(self._alignment_fasta().encode())
            f_msa.flush()
            # The annotation reads both files by name, so it has to run
            # before the context managers delete them.
            annotated_tree, _ = tree_annotate.run_tree_annotate(
                test_tree,
                alignment=f_msa.name,
                **{table_option: f_table.name}
            )
        return annotated_tree

    def test_emapper(self):
        """Each leaf receives the values of its emapper annotations row."""
        parser = utils.get_internal_parser("name")
        test_tree = utils.ete4_parse(
            "(A:1,(B:1,(E:1,D:1)Internal_1:0.5)Internal_2:0.5)Root;")

        columns = (
            '#query', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs',
            'max_annot_lvl', 'COG_category', 'Description', 'Preferred_name',
            'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module',
            'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC', 'CAZy',
            'BiGG_Reaction', 'PFAMs',
        )
        # Columns EC..PFAMs are identical for all four queries.
        shared_tail = (
            '1.18.6.1',
            'ko:K02588',
            'ko00625,ko00910,ko01100,ko01120,map00625,map00910,map01100,map01120',
            'M00175',
            'R05185,R05496',
            'RC00002,RC01395,RC02891',
            'ko00000,ko00001,ko00002,ko01000',
            '-', '-', '-',
            'Fer4_NifH',
        )
        # Columns query..GOs, one tuple per query.
        hits = (
            ('A', '1000565.METUNv1_03972', '4.99e-223', '614.0',
             'COG1348@1|root,COG1348@2|Bacteria,1MVTE@1224|Proteobacteria,'
             '2VIK4@28216|Betaproteobacteria,2KUME@206389|Rhodocyclales',
             '206389|Rhodocyclales', 'P',
             'The key enzymatic reactions in nitrogen fixation are catalyzed'
             ' by the nitrogenase complex which has 2 components the iron'
             ' protein and the molybdenum-iron protein',
             'nifH', '-'),
            ('B', '765911.Thivi_3647', '2.76e-190', '530.0',
             'COG1348@1|root,COG1348@2|Bacteria,1MVTE@1224|Proteobacteria,'
             '1RR82@1236|Gammaproteobacteria,1WW4V@135613|Chromatiales',
             '135613|Chromatiales', 'P',
             'The key enzymatic reactions in nitrogen fixation are catalyzed'
             ' by the nitrogenase complex, which has 2 components the iron'
             ' protein and the molybdenum-iron protein',
             'nifH', '-'),
            ('E', '1009370.ALO_07448', '8.77e-204', '564.0',
             'COG1348@1|root,COG1348@2|Bacteria,1TPXR@1239|Firmicutes,'
             '4H3VB@909932|Negativicutes',
             '909932|Negativicutes', 'P',
             'The key enzymatic reactions in nitrogen fixation are catalyzed'
             ' by the nitrogenase complex, which has 2 components the iron'
             ' protein and the molybdenum-iron protein',
             '-', '-'),
            ('D', '1009370.ALO_17011', '6.09e-173', '483.0',
             'COG1348@1|root,COG1348@2|Bacteria,1TPXR@1239|Firmicutes,'
             '4H2BN@909932|Negativicutes',
             '909932|Negativicutes', 'P',
             'Belongs to the NifH BchL ChlL family',
             '-', '-'),
        )
        # Description values contain spaces, so fields are joined with an
        # explicit tab; a space-separated row could not be split back into
        # the 21 columns named in the header.
        table_lines = ['\t'.join(columns)]
        table_lines.extend('\t'.join(hit + shared_tail) for hit in hits)
        emapper_text = (
            '## Tue Jun 6 10:36:47 2023\n'
            + self._EMAPPER_RUN_HEADER
            + ''.join(line + '\n' for line in table_lines)
            + '## 4 queries scanned\n'
            '## Total time (seconds): 216.7490794658661\n'
            '## Rate: 9.91 q/s\n'
        )

        with NamedTemporaryFile(suffix='.out.emapper.annotations') as f_annotation:
            f_annotation.write(emapper_text.encode())
            f_annotation.flush()
            test_tree_annotated, _ = tree_annotate.run_tree_annotate(
                test_tree,
                emapper_annotations=f_annotation.name,
            )

        expected_tree = Tree(
            # leaf A
            '(A:1[&&NHX:BRITE=ko00000|ko00001|ko00002|ko01000:BiGG_Reaction=NaN:CAZy=NaN:COG_category=P'
            ':Description=The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex which has 2 components the iron protein and the molybdenum-iron protein'
            ':EC=1.18.6.1:GOs=NaN:KEGG_Module=M00175'
            ':KEGG_Pathway=ko00625|ko00910|ko01100|ko01120|map00625|map00910|map01100|map01120'
            ':KEGG_Reaction=R05185|R05496:KEGG_TC=NaN:KEGG_ko=ko_K02588:KEGG_rclass=RC00002|RC01395|RC02891'
            ':PFAMs=Fer4_NifH:Preferred_name=nifH'
            ':eggNOG_OGs=COG1348@1|root|COG1348@2|Bacteria|1MVTE@1224|Proteobacteria|2VIK4@28216|Betaproteobacteria|2KUME@206389|Rhodocyclales'
            ':evalue=4.99e-223:max_annot_lvl=206389|Rhodocyclales:score=614.0:seed_ortholog=1000565.METUNv1_03972],'
            # leaf B
            '(B:1[&&NHX:BRITE=ko00000|ko00001|ko00002|ko01000:BiGG_Reaction=NaN:CAZy=NaN:COG_category=P'
            ':Description=The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex_ which has 2 components the iron protein and the molybdenum-iron protein'
            ':EC=1.18.6.1:GOs=NaN:KEGG_Module=M00175'
            ':KEGG_Pathway=ko00625|ko00910|ko01100|ko01120|map00625|map00910|map01100|map01120'
            ':KEGG_Reaction=R05185|R05496:KEGG_TC=NaN:KEGG_ko=ko_K02588:KEGG_rclass=RC00002|RC01395|RC02891'
            ':PFAMs=Fer4_NifH:Preferred_name=nifH'
            ':eggNOG_OGs=COG1348@1|root|COG1348@2|Bacteria|1MVTE@1224|Proteobacteria|1RR82@1236|Gammaproteobacteria|1WW4V@135613|Chromatiales'
            ':evalue=2.76e-190:max_annot_lvl=135613|Chromatiales:score=530.0:seed_ortholog=765911.Thivi_3647],'
            # leaf E
            '(E:1[&&NHX:BRITE=ko00000|ko00001|ko00002|ko01000:BiGG_Reaction=NaN:CAZy=NaN:COG_category=P'
            ':Description=The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex_ which has 2 components the iron protein and the molybdenum-iron protein'
            ':EC=1.18.6.1:GOs=NaN:KEGG_Module=M00175'
            ':KEGG_Pathway=ko00625|ko00910|ko01100|ko01120|map00625|map00910|map01100|map01120'
            ':KEGG_Reaction=R05185|R05496:KEGG_TC=NaN:KEGG_ko=ko_K02588:KEGG_rclass=RC00002|RC01395|RC02891'
            ':PFAMs=Fer4_NifH:Preferred_name=NaN'
            ':eggNOG_OGs=COG1348@1|root|COG1348@2|Bacteria|1TPXR@1239|Firmicutes|4H3VB@909932|Negativicutes'
            ':evalue=8.77e-204:max_annot_lvl=909932|Negativicutes:score=564.0:seed_ortholog=1009370.ALO_07448],'
            # leaf D
            'D:1[&&NHX:BRITE=ko00000|ko00001|ko00002|ko01000:BiGG_Reaction=NaN:CAZy=NaN:COG_category=P'
            ':Description=Belongs to the NifH BchL ChlL family'
            ':EC=1.18.6.1:GOs=NaN:KEGG_Module=M00175'
            ':KEGG_Pathway=ko00625|ko00910|ko01100|ko01120|map00625|map00910|map01100|map01120'
            ':KEGG_Reaction=R05185|R05496:KEGG_TC=NaN:KEGG_ko=ko_K02588:KEGG_rclass=RC00002|RC01395|RC02891'
            ':PFAMs=Fer4_NifH:Preferred_name=NaN'
            ':eggNOG_OGs=COG1348@1|root|COG1348@2|Bacteria|1TPXR@1239|Firmicutes|4H2BN@909932|Negativicutes'
            ':evalue=6.09e-173:max_annot_lvl=909932|Negativicutes:score=483.0:seed_ortholog=1009370.ALO_17011])'
            # Internal_1
            'Internal_1:0.5[&&NHX:BRITE_counter=ko00000--2||ko00001--2||ko00002--2||ko01000--2'
            ':BiGG_Reaction_counter=NaN--2:CAZy_counter=NaN--2:COG_category_counter=P--2'
            ':Description_counter=Belongs to the NifH BchL ChlL family--1||The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex_ which has 2 components the iron protein and the molybdenum-iron protein--1'
            ':EC_counter=1.18.6.1--2:GOs_counter=NaN--2:KEGG_Module_counter=M00175--2'
            ':KEGG_Pathway_counter=ko00625--2||ko00910--2||ko01100--2||ko01120--2||map00625--2||map00910--2||map01100--2||map01120--2'
            ':KEGG_Reaction_counter=R05185--2||R05496--2:KEGG_TC_counter=NaN--2:KEGG_ko_counter=ko_K02588--2'
            ':KEGG_rclass_counter=RC00002--2||RC01395--2||RC02891--2:PFAMs_counter=Fer4_NifH--2:Preferred_name_counter=NaN--2'
            ':eggNOG_OGs_counter=1TPXR@1239|Firmicutes--2||4H2BN@909932|Negativicutes--1||4H3VB@909932|Negativicutes--1||COG1348@1|root--2||COG1348@2|Bacteria--2'
            ':evalue_avg=3.045e-173:evalue_max=6.09e-173:evalue_min=8.77e-204:evalue_std=0.0:evalue_sum=6.09e-173'
            ':max_annot_lvl_counter=909932|Negativicutes--2'
            ':score_avg=523.5:score_max=564.0:score_min=483.0:score_std=3280.5:score_sum=1047.0'
            ':seed_ortholog_counter=1009370.ALO_07448--1||1009370.ALO_17011--1])'
            # Internal_2
            'Internal_2:0.5[&&NHX:BRITE_counter=ko00000--3||ko00001--3||ko00002--3||ko01000--3'
            ':BiGG_Reaction_counter=NaN--3:CAZy_counter=NaN--3:COG_category_counter=P--3'
            ':Description_counter=Belongs to the NifH BchL ChlL family--1||The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex_ which has 2 components the iron protein and the molybdenum-iron protein--2'
            ':EC_counter=1.18.6.1--3:GOs_counter=NaN--3:KEGG_Module_counter=M00175--3'
            ':KEGG_Pathway_counter=ko00625--3||ko00910--3||ko01100--3||ko01120--3||map00625--3||map00910--3||map01100--3||map01120--3'
            ':KEGG_Reaction_counter=R05185--3||R05496--3:KEGG_TC_counter=NaN--3:KEGG_ko_counter=ko_K02588--3'
            ':KEGG_rclass_counter=RC00002--3||RC01395--3||RC02891--3:PFAMs_counter=Fer4_NifH--3:Preferred_name_counter=NaN--2||nifH--1'
            ':eggNOG_OGs_counter=1MVTE@1224|Proteobacteria--1||1RR82@1236|Gammaproteobacteria--1||1TPXR@1239|Firmicutes--2||1WW4V@135613|Chromatiales--1||4H2BN@909932|Negativicutes--1||4H3VB@909932|Negativicutes--1||COG1348@1|root--3||COG1348@2|Bacteria--3'
            ':evalue_avg=2.03e-173:evalue_max=6.09e-173:evalue_min=8.77e-204:evalue_std=0.0:evalue_sum=6.09e-173'
            ':max_annot_lvl_counter=135613|Chromatiales--1||909932|Negativicutes--2'
            ':score_avg=525.6666666666666:score_max=564.0:score_min=483.0:score_std=1654.3333333333333:score_sum=1577.0'
            ':seed_ortholog_counter=1009370.ALO_07448--1||1009370.ALO_17011--1||765911.Thivi_3647--1])'
            # Root
            'Root:0[&&NHX:BRITE_counter=ko00000--4||ko00001--4||ko00002--4||ko01000--4'
            ':BiGG_Reaction_counter=NaN--4:CAZy_counter=NaN--4:COG_category_counter=P--4'
            ':Description_counter=Belongs to the NifH BchL ChlL family--1||The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex which has 2 components the iron protein and the molybdenum-iron protein--1||The key enzymatic reactions in nitrogen fixation are catalyzed by the nitrogenase complex_ which has 2 components the iron protein and the molybdenum-iron protein--2'
            ':EC_counter=1.18.6.1--4:GOs_counter=NaN--4:KEGG_Module_counter=M00175--4'
            ':KEGG_Pathway_counter=ko00625--4||ko00910--4||ko01100--4||ko01120--4||map00625--4||map00910--4||map01100--4||map01120--4'
            ':KEGG_Reaction_counter=R05185--4||R05496--4:KEGG_TC_counter=NaN--4:KEGG_ko_counter=ko_K02588--4'
            ':KEGG_rclass_counter=RC00002--4||RC01395--4||RC02891--4:PFAMs_counter=Fer4_NifH--4:Preferred_name_counter=NaN--2||nifH--2'
            ':eggNOG_OGs_counter=1MVTE@1224|Proteobacteria--2||1RR82@1236|Gammaproteobacteria--1||1TPXR@1239|Firmicutes--2||1WW4V@135613|Chromatiales--1||2KUME@206389|Rhodocyclales--1||2VIK4@28216|Betaproteobacteria--1||4H2BN@909932|Negativicutes--1||4H3VB@909932|Negativicutes--1||COG1348@1|root--4||COG1348@2|Bacteria--4'
            ':evalue_avg=1.5225e-173:evalue_max=6.09e-173:evalue_min=4.99e-223:evalue_std=0.0:evalue_sum=6.09e-173'
            ':max_annot_lvl_counter=135613|Chromatiales--1||206389|Rhodocyclales--1||909932|Negativicutes--2'
            ':score_avg=547.75:score_max=614.0:score_min=483.0:score_std=3053.5833333333335:score_sum=2191.0'
            ':seed_ortholog_counter=1000565.METUNv1_03972--1||1009370.ALO_07448--1||1009370.ALO_17011--1||765911.Thivi_3647--1];',
            parser=parser)

        # Only leaves are compared; the internal-node summaries present in
        # the expected string are not asserted by this test.
        for leaf in test_tree_annotated.leaves():
            props = list(leaf.props.keys())
            with self.subTest(leaf=leaf.name):
                self.assertEqual(
                    leaf.write(props=props, parser=parser, format_root_node=True),
                    expected_tree[leaf.name].write(props=props, parser=parser, format_root_node=True))

    def test_pfam(self):
        """Pfam hits plus an alignment yield ``dom_arq`` on the tree."""
        parser = utils.get_internal_parser("name")

        pfam_hits = (
            ("1000565.METUNv1_03972", "Fer4_NifH", "6.3e-85", "284.5", "308", "3", "265", "45", "305", "0.8441558441558441"),
            ("1007099.SAMN05216287_3993", "Fer4_NifH", "2.9e-138", "459.5", "294", "1", "270", "4", "274", "0.9183673469387755"),
            ("1121400.SAMN02746065_101305", "Oxidored_nitro", "1.8e-70", "237.2", "734", "1", "398", "321", "721", "0.5449591280653951"),
            ("1121400.SAMN02746065_101305", "Fer4_NifH", "6.5e-93", "310.3", "734", "1", "267", "1", "266", "0.36103542234332425"),
            ("1009370.ALO_07448", "Fer4_NifH", "6.5e-98", "327.1", "291", "1", "264", "6", "268", "0.9003436426116839"),
        )
        # NOTE(review): fields are written tab-separated, matching the
        # annotations table -- confirm the pfam parser splits on tabs.
        pfam_text = (
            "## Tue Jun 6 10:36:50 2023\n"
            + self._EMAPPER_RUN_HEADER
            + "# query_name\thit\tevalue\tsum_score\tquery_length\thmmfrom\thmmto\tseqfrom\tseqto\tquery_coverage\n"
            + "".join("\t".join(hit) + "\n" for hit in pfam_hits)
        )

        test_tree_annotated = self._annotate_domains(pfam_text, "emapper_pfam")

        expected_tree = (
            "(1000565.METUNv1_03972:1[&&NHX:dom_arq=Fer4_NifH@171@608],"
            "(1007099.SAMN05216287:1,"
            "(1121400.SAMN02746065_101305:1[&&NHX:dom_arq=Oxidored_nitro@780@1312||Fer4_NifH@169@610],"
            "1009370.ALO_07448:1[&&NHX:dom_arq=Fer4_NifH@169@607])"
            "Internal_1:0.5[&&NHX:dom_arq=Oxidored_nitro@780@1312||Fer4_NifH@169@610])"
            "Internal_2:0.5[&&NHX:dom_arq=none@none@none])"
            "Root[&&NHX:dom_arq=Fer4_NifH@171@608];"
        )
        self.assertEqual(
            test_tree_annotated.write(props=["dom_arq"], parser=parser, format_root_node=True),
            expected_tree)

    def test_smart(self):
        """SMART hits plus an alignment yield ``dom_arq`` on the tree."""
        parser = utils.get_internal_parser("name")

        # (query, ((domain, start, end, value), ...)) in table order.
        smart_hits = (
            ("1000565.METUNv1_03972", (
                ("AAA", "41", "211", "0.524566042059228"),
                ("SRP54", "42", "262", "36307.0836782689"),
                ("ALAD", "105", "217", "85040.1722567821"),
                ("LIM", "135", "174", "1265.27135640085"),
                ("LytTR", "150", "238", "41327.9413786371"),
                ("VHP", "188", "217", "949.65191865172"),
                ("SAF", "199", "257", "80999.5566918776"),
            )),
            ("1007099.SAMN05216287_3993", (
                ("RAS", "1", "134", "299.463246391791"),
                ("AAA", "2", "130", "1.12006109933941"),
                ("SRP54", "3", "171", "19746.377786172"),
                ("YL1_C", "4", "34", "62142.1549437987"),
                ("UDPG_MGDP_dh_C", "7", "106", "120144.095567571"),
                ("Dak1_2", "8", "277", "100474.2546051"),
                ("HTH_DEOR", "55", "101", "1708.14400541016"),
                ("DHHA2", "58", "173", "85384.7641607122"),
                ("HhH2", "78", "113", "701.373137135878"),
                ("cNMP", "105", "227", "547.330514787662"),
                ("MGS", "135", "210", "82904.7219178997"),
                ("GATase_5", "138", "255", "159773.057819153"),
                ("DUF3585", "150", "270", "30661.2406626041"),
                ("Ribosomal_S13_N", "156", "217", "162292.32589339"),
                ("Amb_V_allergen", "156", "190", "74760.8280411016"),
                ("Malic_M", "158", "266", "77604.2997814085"),
                ("ADSL_C", "167", "246", "61745.9686502824"),
                ("NGN", "177", "272", "881.485864401489"),
            )),
            ("1121400.SAMN02746065_101305", (
                ("AAA", "2", "355", "8.86546657676417"),
                ("H4", "158", "226", "2188.17758527356"),
                ("MeTrc", "189", "414", "894.348474942383"),
                ("FIST_C", "251", "453", "171927.833933555"),
                ("SET", "294", "384", "1449.49197333925"),
                ("GHA", "299", "385", "559.252746615624"),
                ("Cadherin_pro", "350", "413", "90154.3519668998"),
                ("IMPDH", "352", "522", "52956.4312062364"),
                ("MoCF_biosynth", "353", "477", "15920.4774286842"),
                ("ALAD", "359", "504", "31454.7850654048"),
                ("PBP5_C", "362", "449", "10258.5684918211"),
                ("MAPKK1_Int", "423", "511", "108239.883996528"),
                ("BRIGHT", "543", "610", "514.785975446317"),
            )),
            ("1009370.ALO_07448", (
                ("AAA", "4", "194", "16.4901572434334"),
                ("SRP54", "5", "202", "70300.2139690043"),
                ("FtsA", "22", "194", "117837.675850668"),
                ("DHDPS", "51", "252", "101873.191011028"),
                ("DHHA2", "99", "252", "43393.7684825686"),
                ("ETF", "103", "264", "43090.4551064702"),
                ("GATase_5", "134", "291", "96046.6250578487"),
                ("MyTH4", "139", "266", "525.097143529794"),
                ("Haem_bd", "177", "273", "102676.956308069"),
                ("DSRM", "193", "254", "1406.06053620549"),
            )),
        )
        # NOTE(review): fields are written tab-separated, matching the
        # annotations table -- confirm the SMART parser splits on tabs.
        smart_text = "".join(
            "\t".join((query,) + hit) + "\n"
            for query, query_hits in smart_hits
            for hit in query_hits
        )

        test_tree_annotated = self._annotate_domains(smart_text, "emapper_smart")

        expected_tree = (
            "(1000565.METUNv1_03972:1[&&NHX:dom_arq=AAA@165@452||SRP54@168@530||ALAD@283@461||LIM@355@411||LytTR@370@502||VHP@427@461||SAF@438@525],"
            "(1007099.SAMN05216287:1,"
            "(1121400.SAMN02746065_101305:1[&&NHX:dom_arq=AAA@170@814||H4@437@532||MeTrc@478@960||FIST_C@588@999||SET@716@921||GHA@721@922||Cadherin_pro@809@959||IMPDH@811@1071||MoCF_biosynth@812@1025||ALAD@835@1052||PBP5_C@838@995||MAPKK1_Int@969@1059||BRIGHT@1092@1164],"
            "1009370.ALO_07448:1[&&NHX:dom_arq=AAA@165@478||SRP54@168@499||FtsA@185@478||DHDPS@220@575||DHHA2@359@575||ETF@363@603||GATase_5@409@710||MyTH4@414@605||Haem_bd@457@612||DSRM@477@580])"
            "Internal_1:0.5[&&NHX:dom_arq=AAA@170@814||H4@437@532||MeTrc@478@960||FIST_C@588@999||SET@716@921||GHA@721@922||Cadherin_pro@809@959||IMPDH@811@1071||MoCF_biosynth@812@1025||ALAD@835@1052||PBP5_C@838@995||MAPKK1_Int@969@1059||BRIGHT@1092@1164])"
            "Internal_2:0.5[&&NHX:dom_arq=none@none@none])"
            "Root[&&NHX:dom_arq=AAA@165@452||SRP54@168@530||ALAD@283@461||LIM@355@411||LytTR@370@502||VHP@427@461||SAF@438@525];"
        )
        self.assertEqual(
            test_tree_annotated.write(props=["dom_arq"], parser=parser, format_root_node=True),
            expected_tree)

299 

# Run this module's tests with unittest when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
#pytest.main(['-v'])