<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">vavilov</journal-id><journal-title-group><journal-title xml:lang="ru">Вавиловский журнал генетики и селекции</journal-title><trans-title-group xml:lang="en"><trans-title>Vavilov Journal of Genetics and Breeding</trans-title></trans-title-group></journal-title-group><issn pub-type="epub">2500-3259</issn><publisher><publisher-name>Institute of Cytology and Genetics of Siberian Branch of the RAS</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.18699/vjgb-25-119</article-id><article-id custom-type="elpub" pub-id-type="custom">vavilov-4899</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>МЕДИЦИНСКАЯ БИОИНФОРМАТИКА</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="en"><subject>MEDICAL BIOINFORMATICS</subject></subj-group></article-categories><title-group><article-title>OrthoML2GO: предсказание функций белков по гомологии с использованием ортогрупп и алгоритмов машинного обучения</article-title><trans-title-group xml:lang="en"><trans-title>OrthoML2GO: homology-based protein function prediction using orthogroups and machine learning</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Малюгин</surname><given-names>Е. В.</given-names></name><name name-style="western" xml:lang="en"><surname>Malyugin</surname><given-names>E. 
V.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Новосибирск</p></bio><bio xml:lang="en"><p>Novosibirsk</p></bio><email xlink:type="simple">evgeny.malyugin98@gmail.com</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-9738-1409</contrib-id><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Афонников</surname><given-names>Д. А.</given-names></name><name name-style="western" xml:lang="en"><surname>Afonnikov</surname><given-names>D. A.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Новосибирск</p></bio><bio xml:lang="en"><p>Novosibirsk</p></bio><xref ref-type="aff" rid="aff-2"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru">Новосибирский национальный исследовательский государственный университет,<country>Россия</country></aff><aff xml:lang="en">Novosibirsk State University<country>Russian Federation</country></aff></aff-alternatives><aff-alternatives id="aff-2"><aff xml:lang="ru">Федеральный исследовательский центр Институт цитологии и генетики Сибирского отделения Российской академии наук<country>Россия</country></aff><aff xml:lang="en">Institute of Cytology and Genetics of the Siberian Branch of the Russian Academy of Sciences<country>Russian Federation</country></aff></aff-alternatives><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>12</day><month>12</month><year>2025</year></pub-date><volume>29</volume><issue>7</issue><fpage>1145</fpage><lpage>1154</lpage><permissions><copyright-statement>Copyright © Малюгин Е.В., Афонников Д.А., 2025</copyright-statement><copyright-year>2025</copyright-year><copyright-holder xml:lang="ru">Малюгин Е.В., Афонников Д.А.</copyright-holder><copyright-holder xml:lang="en">Malyugin E.V., Afonnikov D.A.</copyright-holder><license license-type="creative-commons-attribution" 
xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://vavilov.elpub.ru/jour/article/view/4899">https://vavilov.elpub.ru/jour/article/view/4899</self-uri><abstract><p>   В последние годы быстрый рост объемов данных секвенирования обострил проблему функциональной аннотации белковых последовательностей, поскольку традиционные методы, основанные на гомологии, сталкиваются с ограничениями при работе с отдаленными гомологами, что затрудняет наиболее точное определение функций белков. В нашей работе представлен метод предсказания функций белков OrthoML2GO, который интегрирует поиск гомологичных последовательностей с помощью алгоритма USEARCH, анализ ортогрупп на базе OrthoDB 12-й версии и алгоритм машинного обучения (градиентный бустинг).</p><p>   Ключевая особенность подхода заключается в использовании информации об ортогруппах для учета эволюционного и функционального сходства белков и применения машинного обучения для дальнейшего уточнения терминов Gene Ontology (GO) для анализируемой последовательности.</p><p>   Для выбора оптимального алгоритма аннотации белков были поэтапно применены следующие подходы: метод k ближайших соседей (KNN); метод на основе аннотации ортогруппы, наиболее представленной у k ближайших гомологов (OG); метод верификации выявленных на предыдущем этапе терминов GO с помощью алгоритмов машинного обучения. Проведено сравнение точности предсказания терминов GO методом OrthoML2GO с программами аннотации Blast2GO и PANNZER2 на выборках последовательностей как отдельных организмов (человек, арабидопсис), так и на комбинированной выборке последовательностей, представленных разными таксонами. 
Результаты показали, что предложенный метод не уступает, а по некоторым показателям превосходит их по качеству предсказания функций белков, особенно на больших и разнородных выборках организмов, а наибольший прирост точности достигается за счет комбинации информации о ближайших гомологах и ортогруппах в сочетании с верификацией терминов методами машинного обучения. Разработанный подход демонстрирует высокую эффективность для крупномасштабной автоматической аннотации белков. Перспективы дальнейшего развития включают оптимизацию параметров моделей машинного обучения под конкретные биологические задачи и интеграцию дополнительных источников структурно-функциональной информации, что позволит еще больше повысить точность и универсальность метода. Кроме того, внедрение новых инструментов биоинформатики и расширение базы данных аннотированных белков будут способствовать дальнейшему совершенствованию предложенного подхода.</p></abstract><trans-abstract xml:lang="en"><p>   In recent years, the rapid growth of sequencing data has exacerbated the problem of functional annotation of protein sequences, as traditional homology-based methods face limitations when working with distant homologs, making it difficult to accurately determine protein functions. 
This paper introduces the OrthoML2GO method for protein function prediction, which integrates homology searches using the USEARCH algorithm, orthogroup analysis based on OrthoDB version 12.0, and a machine learning algorithm (gradient boosting).</p><p>   A key feature of our approach is the use of orthogroup information to account for the evolutionary and functional similarity of proteins and the application of machine learning to refine the assigned GO terms for the target sequence.</p><p>   To select the optimal algorithm for protein annotation, the following approaches were applied sequentially: the k-nearest neighbors (KNN) method; a method based on the annotation of the orthogroup most represented in the k-nearest homologs (OG); a method of verifying the GO terms identified in the previous stage using machine learning algorithms. A comparison of the prediction accuracy of GO terms using the OrthoML2GO method with the Blast2GO and PANNZER2 annotation programs was performed on sequence samples from both individual organisms (humans, Arabidopsis) and a combined sample represented by different taxa. Our results demonstrate that the proposed method is comparable to, and by some evaluation metrics outperforms, these existing methods in terms of the quality of protein function prediction, especially on large and heterogeneous samples of organisms. The greatest performance improvement is achieved by combining information about the closest homologs and orthogroups with verification of terms using machine learning methods. Our approach demonstrates high performance for large-scale automatic protein annotation, and prospects for further development include optimizing machine learning model parameters for specific biological tasks and integrating additional sources of structural and functional information, which will further improve the method’s accuracy and versatility. 
In addition, the introduction of new bioinformatics tools and the expansion of the annotated protein database will contribute to the further improvement of the proposed approach.</p></trans-abstract><kwd-group xml:lang="ru"><kwd>предсказание функций белка</kwd><kwd>генная онтология</kwd><kwd>гомология</kwd><kwd>ортогруппа</kwd><kwd>машинное обучение</kwd></kwd-group><kwd-group xml:lang="en"><kwd>protein function prediction</kwd><kwd>gene ontology</kwd><kwd>homology</kwd><kwd>orthogroup</kwd><kwd>machine learning</kwd></kwd-group><funding-group xml:lang="en"><funding-statement>The work was supported by the Kurchatov Genomic Center of ICG SB RAS under agreement with the Ministry of Science and Higher Education of the Russian Federation No. 075-15-2019-1662, and by the state budget project No. FWNR-2022-0020</funding-statement></funding-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">Altenhoff A.M., Glover N.M., Dessimoz C. Inferring orthology and paralogy. Methods Mol Biol. 2019;1910:149-175. doi: 10.1007/978-1-4939-9074-0_5</mixed-citation><mixed-citation xml:lang="en">Altenhoff A.M., Glover N.M., Dessimoz C. Inferring orthology and paralogy. Methods Mol Biol. 2019;1910:149-175. doi: 10.1007/978-1-4939-9074-0_5</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. Basic local alignment search tool. J Mol Biol. 1990;215(3):403-410. doi: 10.1016/S0022-2836(05)80360-2</mixed-citation><mixed-citation xml:lang="en">Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. Basic local alignment search tool. J Mol Biol. 1990;215(3):403-410. 
doi: 10.1016/S0022-2836(05)80360-2</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Ashburner M., Ball C.A., Blake J.A., Botstein D., Butler H., Cherry J.M., Davis A.P., … Matese J.C., Richardson J.E., Ringwald M., Rubin G.M., Sherlock G. Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. Nat Genet. 2000;25(1): 25-29. doi: 10.1038/75556</mixed-citation><mixed-citation xml:lang="en">Ashburner M., Ball C.A., Blake J.A., Botstein D., Butler H., Cherry J.M., Davis A.P., … Matese J.C., Richardson J.E., Ringwald M., Rubin G.M., Sherlock G. Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. Nat Genet. 2000;25(1): 25-29. doi: 10.1038/75556</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Benso A., Di Carlo S., Ur Rehman H., Politano G., Savino A., Suravajhala P. A combined approach for genome wide protein function annotation/prediction. Proteome Sci. 2013;11(Suppl. 1):S1. doi: 10.1186/1477-5956-11-S1-S1</mixed-citation><mixed-citation xml:lang="en">Benso A., Di Carlo S., Ur Rehman H., Politano G., Savino A., Suravajhala P. A combined approach for genome wide protein function annotation/prediction. Proteome Sci. 2013;11(Suppl. 1):S1. doi: 10.1186/1477-5956-11-S1-S1</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Bradford Y.M., Van Slyke C.E., Ruzicka L., Singer A., Eagle A., Fashena D., Howe D.G., Frazer K., Martin R., Paddock H., Pich C., Ramachandran S., Westerfield M. Zebrafish information network, the knowledgebase for Danio rerio research. Genetics. 2022;220(4): iyac016. 
doi: 10.1093/genetics/iyac016</mixed-citation><mixed-citation xml:lang="en">Bradford Y.M., Van Slyke C.E., Ruzicka L., Singer A., Eagle A., Fashena D., Howe D.G., Frazer K., Martin R., Paddock H., Pich C., Ramachandran S., Westerfield M. Zebrafish information network, the knowledgebase for Danio rerio research. Genetics. 2022;220(4): iyac016. doi: 10.1093/genetics/iyac016</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Buchfink B., Xie C., Huson D.H. Fast and sensitive protein alignment using DIAMOND. Nat Methods. 2015;12(1):59-60. doi: 10.1038/nmeth.3176</mixed-citation><mixed-citation xml:lang="en">Buchfink B., Xie C., Huson D.H. Fast and sensitive protein alignment using DIAMOND. Nat Methods. 2015;12(1):59-60. doi: 10.1038/nmeth.3176</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Cao Y., Shen Y. TALE: Transformer-based protein function Annotation with joint sequence-Label Embedding. Bioinformatics. 2021; 37(18):2825-2833. doi: 10.1093/bioinformatics/btab198</mixed-citation><mixed-citation xml:lang="en">Cao Y., Shen Y. TALE: Transformer-based protein function Annotation with joint sequence-Label Embedding. Bioinformatics. 2021; 37(18):2825-2833. doi: 10.1093/bioinformatics/btab198</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Chen T., Guestrin C. XGBoost: A Scalable Tree Boosting System. In: KDD ’16. Proceedings of the 22<sup>nd</sup> ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. New York, NY, USA: Association for Computing Machinery, 2016;785-794. doi: 10.1145/2939672.2939785</mixed-citation><mixed-citation xml:lang="en">Chen T., Guestrin C. XGBoost: A Scalable Tree Boosting System. In: KDD ’16. 
Proceedings of the 22<sup>nd</sup> ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. New York, NY, USA: Association for Computing Machinery, 2016;785-794. doi: 10.1145/2939672.2939785</mixed-citation></citation-alternatives></ref><ref id="cit9"><label>9</label><citation-alternatives><mixed-citation xml:lang="ru">Cheng S., Melkonian M., Smith S.A., Brockington S., Archibald J.M., Delaux P.M., Li F.W., … Graham S.W., Soltis P.S., Liu X., Xu X., Wong G.K. 10KP: A phylodiverse genome sequencing plan. GigaScience. 2018;7(3):1-9. doi: 10.1093/gigascience/giy013</mixed-citation><mixed-citation xml:lang="en">Cheng S., Melkonian M., Smith S.A., Brockington S., Archibald J.M., Delaux P.M., Li F.W., … Graham S.W., Soltis P.S., Liu X., Xu X., Wong G.K. 10KP: A phylodiverse genome sequencing plan. GigaScience. 2018;7(3):1-9. doi: 10.1093/gigascience/giy013</mixed-citation></citation-alternatives></ref><ref id="cit10"><label>10</label><citation-alternatives><mixed-citation xml:lang="ru">Conesa A., Götz S., García-Gómez J.M., Terol J., Talón M., Robles M. Blast2GO: a universal tool for annotation, visualization and analysis in functional genomics research. Bioinformatics. 2005;21(18): 3674-3676. doi: 10.1093/bioinformatics/bti610</mixed-citation><mixed-citation xml:lang="en">Conesa A., Götz S., García-Gómez J.M., Terol J., Talón M., Robles M. Blast2GO: a universal tool for annotation, visualization and analysis in functional genomics research. Bioinformatics. 2005;21(18): 3674-3676. doi: 10.1093/bioinformatics/bti610</mixed-citation></citation-alternatives></ref><ref id="cit11"><label>11</label><citation-alternatives><mixed-citation xml:lang="ru">Dongardive J., Abraham S. Protein Sequence Classification Based on N-Gram and K-Nearest Neighbor Algorithm. In: Behera H., Mohapatra D. (Eds). Computational Intelligence in Data Mining. Vol. 2. Advances in Intelligent Systems and Computing. Vol. 411. 
Springer, New Delhi, 2016;163-171. doi: 10.1007/978-81-322-2731-1_15</mixed-citation><mixed-citation xml:lang="en">Dongardive J., Abraham S. Protein Sequence Classification Based on N-Gram and K-Nearest Neighbor Algorithm. In: Behera H., Mohapatra D. (Eds). Computational Intelligence in Data Mining. Vol. 2. Advances in Intelligent Systems and Computing. Vol. 411. Springer, New Delhi, 2016;163-171. doi: 10.1007/978-81-322-2731-1_15</mixed-citation></citation-alternatives></ref><ref id="cit12"><label>12</label><citation-alternatives><mixed-citation xml:lang="ru">du Plessis L., Skunca N., Dessimoz C. The what, where, how and why of gene ontology – a primer for bioinformaticians. Brief Bioinform. 2011;12(6):723-735. doi: 10.1093/bib/bbr002</mixed-citation><mixed-citation xml:lang="en">du Plessis L., Skunca N., Dessimoz C. The what, where, how and why of gene ontology – a primer for bioinformaticians. Brief Bioinform. 2011;12(6):723-735. doi: 10.1093/bib/bbr002</mixed-citation></citation-alternatives></ref><ref id="cit13"><label>13</label><citation-alternatives><mixed-citation xml:lang="ru">Edgar R.C. Search and clustering orders of magnitude faster than BLAST. Bioinformatics. 2010;26(19):2460-2461. doi: 10.1093/bioinformatics/btq461</mixed-citation><mixed-citation xml:lang="en">Edgar R.C. Search and clustering orders of magnitude faster than BLAST. Bioinformatics. 2010;26(19):2460-2461. doi: 10.1093/bioinformatics/btq461</mixed-citation></citation-alternatives></ref><ref id="cit14"><label>14</label><citation-alternatives><mixed-citation xml:lang="ru">Eisenberg D., Marcotte E.M., Xenarios I., Yeates T.O. Protein function in the post-genomic era. Nature. 2000;405(6788):823-826. doi: 10.1038/35015694</mixed-citation><mixed-citation xml:lang="en">Eisenberg D., Marcotte E.M., Xenarios I., Yeates T.O. Protein function in the post-genomic era. Nature. 2000;405(6788):823-826. 
doi: 10.1038/35015694</mixed-citation></citation-alternatives></ref><ref id="cit15"><label>15</label><citation-alternatives><mixed-citation xml:lang="ru">Fitch W.M. Distinguishing homologous from analogous proteins. Syst Biol. 1970;19(2):99-113. doi: 10.2307/2412448</mixed-citation><mixed-citation xml:lang="en">Fitch W.M. Distinguishing homologous from analogous proteins. Syst Biol. 1970;19(2):99-113. doi: 10.2307/2412448</mixed-citation></citation-alternatives></ref><ref id="cit16"><label>16</label><citation-alternatives><mixed-citation xml:lang="ru">Fitch W.M. Homology a personal view on some of the problems. Trends Genet. 2000;16(5):227-231. doi: 10.1016/s0168-9525(00)02005-9</mixed-citation><mixed-citation xml:lang="en">Fitch W.M. Homology a personal view on some of the problems. Trends Genet. 2000;16(5):227-231. doi: 10.1016/s0168-9525(00)02005-9</mixed-citation></citation-alternatives></ref><ref id="cit17"><label>17</label><citation-alternatives><mixed-citation xml:lang="ru">Galperin M.Y., Koonin E.V. From complete genome sequence to ‘complete’ understanding? Trends Biotechnol. 2010;28(8):398-406. doi: 10.1016/j.tibtech.2010.05.006</mixed-citation><mixed-citation xml:lang="en">Galperin M.Y., Koonin E.V. From complete genome sequence to ‘complete’ understanding? Trends Biotechnol. 2010;28(8):398-406. doi: 10.1016/j.tibtech.2010.05.006</mixed-citation></citation-alternatives></ref><ref id="cit18"><label>18</label><citation-alternatives><mixed-citation xml:lang="ru">Gene Ontology Consortium; Aleksander S.A., Balhoff J., Carbon S., Cherry J.M., Drabkin H.J., Ebert D., ... Ponferrada V., Zorn A., Ramachandran S., Ruzicka L., Westerfield M. The Gene Ontology knowledgebase in 2023. Genetics. 2023;224(1):iyad031. doi: 10.1093/genetics/iyad031</mixed-citation><mixed-citation xml:lang="en">Gene Ontology Consortium; Aleksander S.A., Balhoff J., Carbon S., Cherry J.M., Drabkin H.J., Ebert D., ... Ponferrada V., Zorn A., Ramachandran S., Ruzicka L., Westerfield M. 
The Gene Ontology knowledgebase in 2023. Genetics. 2023;224(1):iyad031. doi: 10.1093/genetics/iyad031</mixed-citation></citation-alternatives></ref><ref id="cit19"><label>19</label><citation-alternatives><mixed-citation xml:lang="ru">Goodwin S., McPherson J.D., McCombie W.R. Coming of age: ten years of next-generation sequencing technologies. Nat Rev Genet. 2016;17(6):333-351. doi: 10.1038/nrg.2016.49</mixed-citation><mixed-citation xml:lang="en">Goodwin S., McPherson J.D., McCombie W.R. Coming of age: ten years of next-generation sequencing technologies. Nat Rev Genet. 2016;17(6):333-351. doi: 10.1038/nrg.2016.49</mixed-citation></citation-alternatives></ref><ref id="cit20"><label>20</label><citation-alternatives><mixed-citation xml:lang="ru">Grigoriev I.V., Hayes R.D., Calhoun S., Kamel B., Wang A., Ahrendt S., Dusheyko S., Nikitin R., Mondo S.J., Salamov A., Shabalov I., Kuo A. PhycoCosm, a comparative algal genomics resource. Nucleic Acids Res. 2021;49(D1):1004-1011. doi: 10.1093/nar/gkaa898</mixed-citation><mixed-citation xml:lang="en">Grigoriev I.V., Hayes R.D., Calhoun S., Kamel B., Wang A., Ahrendt S., Dusheyko S., Nikitin R., Mondo S.J., Salamov A., Shabalov I., Kuo A. PhycoCosm, a comparative algal genomics resource. Nucleic Acids Res. 2021;49(D1):1004-1011. doi: 10.1093/nar/gkaa898</mixed-citation></citation-alternatives></ref><ref id="cit21"><label>21</label><citation-alternatives><mixed-citation xml:lang="ru">Hamilton J.P., Brose J., Buell C.R. SpudDB: a database for accessing potato genomic data. Genetics. 2025a;229(3):iyae205. doi: 10.1093/genetics/iyae205</mixed-citation><mixed-citation xml:lang="en">Hamilton J.P., Brose J., Buell C.R. SpudDB: a database for accessing potato genomic data. Genetics. 2025a;229(3):iyae205. doi: 10.1093/genetics/iyae205</mixed-citation></citation-alternatives></ref><ref id="cit22"><label>22</label><citation-alternatives><mixed-citation xml:lang="ru">Hamilton J.P., Li C., Buell C.R. 
The rice genome annotation project: an updated database for mining the rice genome. Nucleic Acids Res. 2025b;53(1):1614-1622. doi: 10.1093/nar/gkae1061</mixed-citation><mixed-citation xml:lang="en">Hamilton J.P., Li C., Buell C.R. The rice genome annotation project: an updated database for mining the rice genome. Nucleic Acids Res. 2025b;53(1):1614-1622. doi: 10.1093/nar/gkae1061</mixed-citation></citation-alternatives></ref><ref id="cit23"><label>23</label><citation-alternatives><mixed-citation xml:lang="ru">Huntley R.P., Sawford T., Mutowo-Meullenet P., Shypitsyna A., Bonilla C., Martin M.J., O’Donovan C. The GOA database: Gene Ontology annotation updates for 2015. Nucleic Acids Res. 2015; 43(D1):1057-1063. doi: 10.1093/nar/gku1113</mixed-citation><mixed-citation xml:lang="en">Huntley R.P., Sawford T., Mutowo-Meullenet P., Shypitsyna A., Bonilla C., Martin M.J., O’Donovan C. The GOA database: Gene Ontology annotation updates for 2015. Nucleic Acids Res. 2015; 43(D1):1057-1063. doi: 10.1093/nar/gku1113</mixed-citation></citation-alternatives></ref><ref id="cit24"><label>24</label><citation-alternatives><mixed-citation xml:lang="ru">Jensen L.J., Julien P., Kuhn M., von Mering C., Muller J., Doerks T., Bork P. eggNOG: automated construction and annotation of orthologous groups of genes. Nucleic Acids Res. 2008;36(Database issue): 250-254. doi: 10.1093/nar/gkm796</mixed-citation><mixed-citation xml:lang="en">Jensen L.J., Julien P., Kuhn M., von Mering C., Muller J., Doerks T., Bork P. eggNOG: automated construction and annotation of orthologous groups of genes. Nucleic Acids Res. 2008;36(Database issue): 250-254. doi: 10.1093/nar/gkm796</mixed-citation></citation-alternatives></ref><ref id="cit25"><label>25</label><citation-alternatives><mixed-citation xml:lang="ru">Kharsikar S., Mugler D., Sheffer D., Moore F., Duan Z.H. A weighted k-nearest neighbor method for gene ontology based protein function prediction. 
In: Proceedings of the Second International Multi-Symposiums on Computer and Computational Sciences (IMSCCS ‘07). IEEE Computer Society, USA, 2007;25-31. doi: 10.1109/IMSCCS.2007.13</mixed-citation><mixed-citation xml:lang="en">Kharsikar S., Mugler D., Sheffer D., Moore F., Duan Z.H. A weighted k-nearest neighbor method for gene ontology based protein function prediction. In: Proceedings of the Second International Multi-Symposiums on Computer and Computational Sciences (IMSCCS ‘07). IEEE Computer Society, USA, 2007;25-31. doi: 10.1109/IMSCCS.2007.13</mixed-citation></citation-alternatives></ref><ref id="cit26"><label>26</label><citation-alternatives><mixed-citation xml:lang="ru">Kriventseva E.V., Rahman N., Espinosa O., Zdobnov E.M. OrthoDB: the hierarchical catalog of eukaryotic orthologs. Nucleic Acids Res. 2008;36(Database issue):271-275. doi: 10.1093/nar/gkm845</mixed-citation><mixed-citation xml:lang="en">Kriventseva E.V., Rahman N., Espinosa O., Zdobnov E.M. OrthoDB: the hierarchical catalog of eukaryotic orthologs. Nucleic Acids Res. 2008;36(Database issue):271-275. doi: 10.1093/nar/gkm845</mixed-citation></citation-alternatives></ref><ref id="cit27"><label>27</label><citation-alternatives><mixed-citation xml:lang="ru">Kulmanov M., Hoehndorf R. DeepGOPlus: improved protein function prediction from sequence. Bioinformatics. 2020;36(2):422-429. doi: 10.1093/bioinformatics/btz595</mixed-citation><mixed-citation xml:lang="en">Kulmanov M., Hoehndorf R. DeepGOPlus: improved protein function prediction from sequence. Bioinformatics. 2020;36(2):422-429. doi: 10.1093/bioinformatics/btz595</mixed-citation></citation-alternatives></ref><ref id="cit28"><label>28</label><citation-alternatives><mixed-citation xml:lang="ru">Kuzniar A., van Ham R.C., Pongor S., Leunissen J.A. The quest for orthologs: finding the corresponding gene across genomes. Trends Genet. 2008;24(11):539-551. 
doi: 10.1016/j.tig.2008.08.009</mixed-citation><mixed-citation xml:lang="en">Kuzniar A., van Ham R.C., Pongor S., Leunissen J.A. The quest for orthologs: finding the corresponding gene across genomes. Trends Genet. 2008;24(11):539-551. doi: 10.1016/j.tig.2008.08.009</mixed-citation></citation-alternatives></ref><ref id="cit29"><label>29</label><citation-alternatives><mixed-citation xml:lang="ru">Lewin H.A., Robinson G.E., Kress W.J., Baker W.J., Coddington J., Crandall K.A., Durbin R., …van Sluys M.A., Soltis P.S., Xu X., Yang H., Zhang G. Earth BioGenome project: Sequencing life for the future of life. Proc Natl Acad Sci USA. 2018;115(17):4325-4333. doi: 10.1073/pnas.1720115115</mixed-citation><mixed-citation xml:lang="en">Lewin H.A., Robinson G.E., Kress W.J., Baker W.J., Coddington J., Crandall K.A., Durbin R., …van Sluys M.A., Soltis P.S., Xu X., Yang H., Zhang G. Earth BioGenome project: Sequencing life for the future of life. Proc Natl Acad Sci USA. 2018;115(17):4325-4333. doi: 10.1073/pnas.1720115115</mixed-citation></citation-alternatives></ref><ref id="cit30"><label>30</label><citation-alternatives><mixed-citation xml:lang="ru">Liaw A., Wiener M. Classification and Regression by randomForest. R News. 2002;2(3):18-22. doi: 10.32614/CRAN.package.randomForest</mixed-citation><mixed-citation xml:lang="en">Liaw A., Wiener M. Classification and Regression by randomForest. R News. 2002;2(3):18-22. doi: 10.32614/CRAN.package.randomForest</mixed-citation></citation-alternatives></ref><ref id="cit31"><label>31</label><citation-alternatives><mixed-citation xml:lang="ru">Öztürk-Çolak A., Marygold S.J., Antonazzo G., Attrill H., Goutte-Gattat D., Jenkins V.K., Matthews B.B., Millburn G., Dos Santos G., Tabone C.J.; FlyBase Consortium. FlyBase: updates to the Drosophila genes and genomes database. Genetics. 2024;227(1):iyad211. 
doi: 10.1093/genetics/iyad211</mixed-citation><mixed-citation xml:lang="en">Öztürk-Çolak A., Marygold S.J., Antonazzo G., Attrill H., Goutte-Gattat D., Jenkins V.K., Matthews B.B., Millburn G., Dos Santos G., Tabone C.J.; FlyBase Consortium. FlyBase: updates to the Drosophila genes and genomes database. Genetics. 2024;227(1):iyad211. doi: 10.1093/genetics/iyad211</mixed-citation></citation-alternatives></ref><ref id="cit32"><label>32</label><citation-alternatives><mixed-citation xml:lang="ru">Pearson W.R. An introduction to sequence similarity (“homology”) searching. Curr Protoc Bioinformatics. 2013;42(3):3.1.1-3.1.8. doi: 10.1002/0471250953.bi0301s42</mixed-citation><mixed-citation xml:lang="en">Pearson W.R. An introduction to sequence similarity (“homology”) searching. Curr Protoc Bioinformatics. 2013;42(3):3.1.1-3.1.8. doi: 10.1002/0471250953.bi0301s42</mixed-citation></citation-alternatives></ref><ref id="cit33"><label>33</label><citation-alternatives><mixed-citation xml:lang="ru">R Core Team. R: A language and environment for statistical computing. R Foundation for Statistical Computing. Vienna, 2013. Available: http://www.R-project.org/</mixed-citation><mixed-citation xml:lang="en">R Core Team. R: A language and environment for statistical computing. R Foundation for Statistical Computing. Vienna, 2013. Available: http://www.R-project.org/</mixed-citation></citation-alternatives></ref><ref id="cit34"><label>34</label><citation-alternatives><mixed-citation xml:lang="ru">Reiser L., Bakker E., Subramaniam S., Chen X., Sawant S., Khosa K., Prithvi T., Berardini T.Z. The Arabidopsis Information Resource in 2024. Genetics. 2024;227(1):iyae027. doi: 10.1093/genetics/iyae027</mixed-citation><mixed-citation xml:lang="en">Reiser L., Bakker E., Subramaniam S., Chen X., Sawant S., Khosa K., Prithvi T., Berardini T.Z. The Arabidopsis Information Resource in 2024. Genetics. 2024;227(1):iyae027. 
doi: 10.1093/genetics/iyae027</mixed-citation></citation-alternatives></ref><ref id="cit35"><label>35</label><citation-alternatives><mixed-citation xml:lang="ru">Sanderson T., Bileschi M.L., Belanger D., Colwell L.J. ProteInfer, deep neural networks for protein functional inference. eLife. 2023;12: e80942. doi: 10.7554/eLife.80942</mixed-citation><mixed-citation xml:lang="en">Sanderson T., Bileschi M.L., Belanger D., Colwell L.J. ProteInfer, deep neural networks for protein functional inference. eLife. 2023;12: e80942. doi: 10.7554/eLife.80942</mixed-citation></citation-alternatives></ref><ref id="cit36"><label>36</label><citation-alternatives><mixed-citation xml:lang="ru">Steinegger M., Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol. 2017;35(11):1026-1028. doi: 10.1038/nbt.3988</mixed-citation><mixed-citation xml:lang="en">Steinegger M., Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol. 2017;35(11):1026-1028. doi: 10.1038/nbt.3988</mixed-citation></citation-alternatives></ref><ref id="cit37"><label>37</label><citation-alternatives><mixed-citation xml:lang="ru">Suzuki S., Kakuta M., Ishida T., Akiyama Y. GHOSTX: an improved sequence homology search algorithm using a query suffix array and a database suffix array. PLoS One. 2014;9(8):e103833. doi: 10.1371/journal.pone.0103833</mixed-citation><mixed-citation xml:lang="en">Suzuki S., Kakuta M., Ishida T., Akiyama Y. GHOSTX: an improved sequence homology search algorithm using a query suffix array and a database suffix array. PLoS One. 2014;9(8):e103833. doi: 10.1371/journal.pone.0103833</mixed-citation></citation-alternatives></ref><ref id="cit38"><label>38</label><citation-alternatives><mixed-citation xml:lang="ru">Tegenfeldt F., Kuznetsov D., Manni M., Berkeley M., Zdobnov E.M., Kriventseva E.V. OrthoDB and BUSCO update: annotation of orthologs with wider sampling of genomes. 
Nucleic Acids Res. 2025; 53(D1):D516-D522. doi: 10.1093/nar/gkae987</mixed-citation><mixed-citation xml:lang="en">Tegenfeldt F., Kuznetsov D., Manni M., Berkeley M., Zdobnov E.M., Kriventseva E.V. OrthoDB and BUSCO update: annotation of orthologs with wider sampling of genomes. Nucleic Acids Res. 2025; 53(D1):D516-D522. doi: 10.1093/nar/gkae987</mixed-citation></citation-alternatives></ref><ref id="cit39"><label>39</label><citation-alternatives><mixed-citation xml:lang="ru">Törönen P., Medlar A., Holm L. PANNZER2: a rapid functional annotation web server. Nucleic Acids Res. 2018;46(W1):W84-W88. doi: 10.1093/nar/gky350</mixed-citation><mixed-citation xml:lang="en">Törönen P., Medlar A., Holm L. PANNZER2: a rapid functional annotation web server. Nucleic Acids Res. 2018;46(W1):W84-W88. doi: 10.1093/nar/gky350</mixed-citation></citation-alternatives></ref><ref id="cit40"><label>40</label><citation-alternatives><mixed-citation xml:lang="ru">Wickham H., François R., Henry L., Müller K., Vaughan D. dplyr: A Grammar of Data Manipulation. R package version 1.1.4. 2025. doi: 10.32614/CRAN.package.dplyr</mixed-citation><mixed-citation xml:lang="en">Wickham H., François R., Henry L., Müller K., Vaughan D. dplyr: A Grammar of Data Manipulation. R package version 1.1.4. 2025. doi: 10.32614/CRAN.package.dplyr</mixed-citation></citation-alternatives></ref><ref id="cit41"><label>41</label><citation-alternatives><mixed-citation xml:lang="ru">Yao S., You R., Wang S., Xiong Y., Huang X., Zhu S. NetGO 2.0: improving large-scale protein function prediction with massive sequence, text, domain, family and network information. Nucleic Acids Res. 2021;49(W1):W469-W475. doi: 10.1093/nar/gkab398</mixed-citation><mixed-citation xml:lang="en">Yao S., You R., Wang S., Xiong Y., Huang X., Zhu S. NetGO 2.0: improving large-scale protein function prediction with massive sequence, text, domain, family and network information. Nucleic Acids Res. 2021;49(W1):W469-W475. 
doi: 10.1093/nar/gkab398</mixed-citation></citation-alternatives></ref><ref id="cit42"><label>42</label><citation-alternatives><mixed-citation xml:lang="ru">You R., Zhang Z., Xiong Y., Sun F., Mamitsuka H., Zhu S. GOLabeler: improving sequence-based large-scale protein function prediction by learning to rank. Bioinformatics. 2018;34(14):2465-2473. doi: 10.1093/bioinformatics/bty130</mixed-citation><mixed-citation xml:lang="en">You R., Zhang Z., Xiong Y., Sun F., Mamitsuka H., Zhu S. GOLabeler: improving sequence-based large-scale protein function prediction by learning to rank. Bioinformatics. 2018;34(14):2465-2473. doi: 10.1093/bioinformatics/bty130</mixed-citation></citation-alternatives></ref><ref id="cit43"><label>43</label><citation-alternatives><mixed-citation xml:lang="ru">Yuan Q., Xie J., Xie J., Zhao H., Yang Y. Fast and accurate protein function prediction from sequence through pretrained language model and homology-based label diffusion. Brief Bioinform. 2023;24(3): bbad117. doi: 10.1093/bib/bbad117</mixed-citation><mixed-citation xml:lang="en">Yuan Q., Xie J., Xie J., Zhao H., Yang Y. Fast and accurate protein function prediction from sequence through pretrained language model and homology-based label diffusion. Brief Bioinform. 2023;24(3): bbad117. doi: 10.1093/bib/bbad117</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that there are no conflicts of interest present.</p></fn></fn-group></back></article>
