<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">vavilov</journal-id><journal-title-group><journal-title xml:lang="ru">Вавиловский журнал генетики и селекции</journal-title><trans-title-group xml:lang="en"><trans-title>Vavilov Journal of Genetics and Breeding</trans-title></trans-title-group></journal-title-group><issn pub-type="epub">2500-3259</issn><publisher><publisher-name>Institute of Cytology and Genetics of Siberian Branch of the RAS</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.18699/VJ19.584</article-id><article-id custom-type="elpub" pub-id-type="custom">vavilov-2395</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>МОЛЕКУЛЯРНЫЕ МАРКЕРЫ В ГЕНЕТИКЕ И СЕЛЕКЦИИ</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="en"><subject>BIOINFORMATICS AND SYSTEM BIOLOGY</subject></subj-group></article-categories><title-group><article-title>Метод главных компонент и его обобщения для последовательности любого типа (PCA-Seq)</article-title><trans-title-group xml:lang="en"><trans-title>Principal component analysis and its generalizations for any type of sequence (PCA-Seq)</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Ефимов</surname><given-names>В. М.</given-names></name><name name-style="western" xml:lang="en"><surname>Efimov</surname><given-names>V. 
M.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Новосибирск</p></bio><bio xml:lang="en"><p>Novosibirsk</p></bio><email xlink:type="simple">efimov@bionet.nsc.ru</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Ефимов</surname><given-names>К. В.</given-names></name><name name-style="western" xml:lang="en"><surname>Efimov</surname><given-names>K. V.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Москва</p></bio><bio xml:lang="en"><p>Moscow</p></bio><xref ref-type="aff" rid="aff-2"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Ковалева</surname><given-names>В. Ю.</given-names></name><name name-style="western" xml:lang="en"><surname>Kovaleva</surname><given-names>V. Y.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Новосибирск</p></bio><bio xml:lang="en"><p>Novosibirsk</p></bio><xref ref-type="aff" rid="aff-3"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru">Федеральный исследовательский центр Институт цитологии и генетики Сибирского отделения Российской академии наук; Институт систематики и экологии животных Сибирского отделения Российской академии наук; Новосибирский государственный университет; Томский государственный университет<country>Россия</country></aff><aff xml:lang="en">Institute of Cytology and Genetics, SB RAS; Institute of Systematics and Ecology of Animals, SB RAS; Novosibirsk State University; Tomsk State University<country>Russian Federation</country></aff></aff-alternatives><aff-alternatives id="aff-2"><aff xml:lang="ru">Московский физико-технический институт (государственный университет)<country>Россия</country></aff><aff xml:lang="en">Moscow Institute of Physics and Technology (State University)<country>Russian Federation</country></aff></aff-alternatives><aff-alternatives 
id="aff-3"><aff xml:lang="ru">Институт систематики и экологии животных Сибирского отделения Российской академии наук<country>Россия</country></aff><aff xml:lang="en">Institute of Systematics and Ecology of Animals, SB RAS<country>Russian Federation</country></aff></aff-alternatives><pub-date pub-type="collection"><year>2019</year></pub-date><pub-date pub-type="epub"><day>09</day><month>01</month><year>2020</year></pub-date><volume>23</volume><issue>8</issue><fpage>1032</fpage><lpage>1036</lpage><permissions><copyright-statement>Copyright &#x00A9; Ефимов В.М., Ефимов К.В., Ковалева В.Ю., 2020</copyright-statement><copyright-year>2020</copyright-year><copyright-holder xml:lang="ru">Ефимов В.М., Ефимов К.В., Ковалева В.Ю.</copyright-holder><copyright-holder xml:lang="en">Efimov V.M., Efimov K.V., Kovaleva V.Y.</copyright-holder><license license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://vavilov.elpub.ru/jour/article/view/2395">https://vavilov.elpub.ru/jour/article/view/2395</self-uri><abstract><p>В 1940-х гг. К. Карунен и М. Лоев предложили метод обработки одномерного числового временного ряда через его преобразование в многомерный путем сдвига несколько раз подряд и разложения на несколько ортогональных временных рядов методом главных компонент (PCA). Предложенный метод ранее независимо возникал и применялся на практике под разными названиями (EOF, SSA, Гусеница и т. д.). Оказалось, что он универсальный, применим к любому временному ряду и, не требуя предположения стационарности, автоматически разлагает его на тренд, циклические составляющие и шум. В наши дни чаще всего используется название SSA (сингулярный спектральный анализ). В начале 1980-х гг. Ф. 
Такенс показал, что для динамической системы сдвиги только одной наблюдаемой переменной позволяют построить аттрактор всей системы, и тем самым подвел под SSA мощную теоретическую базу. Тогда же выяснилась практическая польза фазовых портретов, что было применено, в частности, при анализе и прогнозе динамики численности животных. В настоящей работе предлагается распространить SSA на одномерную последовательность элементов любого типа, включая числа, символы, фигуры и т. д., и в качестве частного случая – на молекулярную последовательность. Технически проблема решается практически тем же алгоритмом, что и SSA. Последовательность режется скользящим окном на фрагменты заданной длины. Между всеми фрагментами вычисляется матрица евклидовых расстояний. Это всегда возможно. Например, квадратный корень из p-дистанции (дистанции Хэмминга) является евклидовым расстоянием. Для полученной матрицы методом главных координат (PCo) вычисляются главные компоненты. Вместо расстояний можно использовать любые индексы сходства/различия и применить методы многомерного шкалирования (MDS). В итоге все равно будут получены главные компоненты в некотором евклидовом пространстве. Мы назвали этот метод PCA-Seq. Это, безусловно, разведочный метод, как и его частный случай SSA. Для любой последовательности, в том числе молекулярной, PCA-Seq без всяких дополнительных предположений позволяет получить ее главные компоненты в числовом виде и визуализировать их в виде графиков и фазовых портретов. Многолетний опыт применения SSA для числовых данных дает все основания полагать, что PCA-Seq окажется не менее полезным при анализе нечисловых данных, особенно при выдвижении гипотез. PCA-Seq реализован в свободно распространяемом пакете Jacobi 4 (http://jacobi4.ru/).</p></abstract><trans-abstract xml:lang="en"><p>In the 1940s, Karhunen and Loève proposed a method for processing a one-dimensional numeric time series by converting it into multidimensional by shifts. 
In fact, a one-dimensional number series was decomposed into several orthogonal time series. This method has been independently developed many times and applied in practice under various names (EOF, SSA, Caterpillar, etc.). Nowadays, the name ‘SSA’ (Singular Spectrum Analysis) is the one most often used. It turned out that the method is universal: it is applicable to any time series, does not require stationarity assumptions, and automatically decomposes a time series into a trend, cyclic components and noise. By the beginning of the 1980s, Takens had shown that for a dynamical system such a method makes it possible to obtain an attractor from observing only one of its variables, thereby giving the method a powerful theoretical basis. In the same years, the practical benefits of phase portraits became clear. In particular, they were used in the analysis and forecast of animal abundance dynamics. In this paper we propose to extend SSA to a one-dimensional sequence of any type of elements, including numbers, symbols, figures, etc., and, as a special case, to a molecular sequence. Technically, the problem is solved using an algorithm like SSA. The sequence is cut by a sliding window into fragments of a given length. Between all fragments, the matrix of Euclidean distances is calculated. This is always possible. For example, the square root of the Hamming distance between fragments is a Euclidean distance. For the resulting matrix, the principal components are calculated by the principal-coordinate method (PCo). Instead of a distance matrix, one can use a matrix of any similarity/dissimilarity indexes and apply methods of multidimensional scaling (MDS). The result will always be PCs in some Euclidean space. We called this method ‘PCA-Seq’. It is certainly an exploratory method, as is its particular case SSA. 
For any sequence, including molecular, PCA-Seq without any additional assumptions allows presenting its principal components in a numerical form and visualizing them in the form of phase portraits. A long history of SSA application for numerical data gives every reason to believe that PCA-Seq will be no less useful in the analysis of non-numerical data, especially in hypothesizing. PCA-Seq is implemented in the freely distributed Jacobi 4 package (http://jacobi4.ru/).</p></trans-abstract><kwd-group xml:lang="ru"><kwd>временные ряды</kwd><kwd>SVD</kwd><kwd>PCA</kwd><kwd>PCo</kwd><kwd>MDS</kwd><kwd>SSA</kwd><kwd>молекулярные последовательности</kwd><kwd>p-дистанция</kwd></kwd-group><kwd-group xml:lang="en"><kwd>time series</kwd><kwd>SVD</kwd><kwd>PCA</kwd><kwd>PCo</kwd><kwd>MDS</kwd><kwd>SSA</kwd><kwd>molecular sequences</kwd><kwd>p-distance</kwd></kwd-group><funding-group xml:lang="ru"><funding-statement>Supported by Russian Foundation for Basic Research (# 19-07-00658). The authors are grateful to D.A. Afonnikov, P.N. Menshanov and two anonymous reviewers for useful discussion and constructive comments.</funding-statement></funding-group><funding-group xml:lang="en"><funding-statement>Supported by Russian Foundation for Basic Research (# 19-07-00658). The authors are grateful to D.A. Afonnikov, P.N. Menshanov and two anonymous reviewers for useful discussion and constructive comments.</funding-statement></funding-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">Efimov V.M., Galaktionov Y.K. On the possibility of predicting cyclic changes in the abundance of mammals. Zhurnal Obshchey Biologii = Journal of General Biology. 1983;3:343-352. (in Russian)</mixed-citation><mixed-citation xml:lang="en">Efimov V.M., Galaktionov Y.K. On the possibility of predicting cyclic changes in the abundance of mammals. Zhurnal Obshchey Biologii = Journal of General Biology. 
1983;3:343-352. (in Russian)</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Efimov V.M., Galaktionov Y.K., Galaktionova T.A. Reconstruction and prognosis of water vole population dynamics on the basis of tularemia morbidity among Novosibirsk oblast residents. Doklady. Biological Sciences. 2003;388(1/6):59-61.</mixed-citation><mixed-citation xml:lang="en">Efimov V.M., Galaktionov Y.K., Galaktionova T.A. Reconstruction and prognosis of water vole population dynamics on the basis of tularemia morbidity among Novosibirsk oblast residents. Doklady. Biological Sciences. 2003;388(1/6):59-61.</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Efimov V.M., Galaktionov Y.K., Shushpanova N.F. Analysis and Prediction of Time Series by the Principal Component Method. Novosibirsk: Nauka Publ., 1988. (in Russian)</mixed-citation><mixed-citation xml:lang="en">Efimov V.M., Galaktionov Y.K., Shushpanova N.F. Analysis and Prediction of Time Series by the Principal Component Method. Novosibirsk: Nauka Publ., 1988. (in Russian)</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Efimov V.M., Kovaleva V.Y., Efimov K.V. Principal Component Analysis for any type Sequences (PCA-Seq). In: Mathematical Modeling and High-Performance Computing in Bioinformatics, Biomedicine and Biotechnology (MM-HPC-BBB-2018): Proc. of the 3rd Int. Symp. Novosibirsk, 21–24 Aug 2018. Novosibirsk, 2018;20.</mixed-citation><mixed-citation xml:lang="en">Efimov V.M., Kovaleva V.Y., Efimov K.V. Principal Component Analysis for any type Sequences (PCA-Seq). In: Mathematical Modeling and High-Performance Computing in Bioinformatics, Biomedicine and Biotechnology (MM-HPC-BBB-2018): Proc. of the 3rd Int. Symp. Novosibirsk, 21–24 Aug 2018. 
Novosibirsk, 2018;20.</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Efimov V.M., Melchakova M.A., Kovaleva V.Y. Geometric properties of evolutionary distances. Vavilovskii Zhurnal Genetiki i Selektsii = Vavilov Journal of Genetics and Breeding. 2013;17(4/1):714-723. (in Russian)</mixed-citation><mixed-citation xml:lang="en">Efimov V.M., Melchakova M.A., Kovaleva V.Y. Geometric properties of evolutionary distances. Vavilovskii Zhurnal Genetiki i Selektsii = Vavilov Journal of Genetics and Breeding. 2013;17(4/1):714-723. (in Russian)</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Golyandina N., Korobeynikov A., Zhigljavsky A. Singular Spectrum Analysis with R. (Ser. Use R!) Berlin; Heidelberg: Springer Verlag, 2018.</mixed-citation><mixed-citation xml:lang="en">Golyandina N., Korobeynikov A., Zhigljavsky A. Singular Spectrum Analysis with R. (Ser. Use R!) Berlin; Heidelberg: Springer Verlag, 2018.</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Golyandina N., Nekrutkin V., Zhigljavsky A.A. Analysis of Time Series Structure: SSA and Related Techniques. Chapman and Hall/CRC, 2001.</mixed-citation><mixed-citation xml:lang="en">Golyandina N., Nekrutkin V., Zhigljavsky A.A. Analysis of Time Series Structure: SSA and Related Techniques. Chapman and Hall/CRC, 2001.</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Golyandina N., Zhigljavsky A. Singular Spectrum Analysis for Time Series. Springer Science &amp; Business Media, 2013.</mixed-citation><mixed-citation xml:lang="en">Golyandina N., Zhigljavsky A. Singular Spectrum Analysis for Time Series. 
Springer Science &amp; Business Media, 2013.</mixed-citation></citation-alternatives></ref><ref id="cit9"><label>9</label><citation-alternatives><mixed-citation xml:lang="ru">Gower J.C. Some distance properties of latent root and vector methods used in multivariate analysis. Biometrika. 1966;53(3/4):325-338.</mixed-citation><mixed-citation xml:lang="en">Gower J.C. Some distance properties of latent root and vector methods used in multivariate analysis. Biometrika. 1966;53(3/4):325-338.</mixed-citation></citation-alternatives></ref><ref id="cit10"><label>10</label><citation-alternatives><mixed-citation xml:lang="ru">Jolliffe I.T., Cadima J. Principal component analysis: a review and recent developments. Phil. Trans. R. Soc. A. 2016;374:20150202.</mixed-citation><mixed-citation xml:lang="en">Jolliffe I.T., Cadima J. Principal component analysis: a review and recent developments. Phil. Trans. R. Soc. A. 2016;374:20150202.</mixed-citation></citation-alternatives></ref><ref id="cit11"><label>11</label><citation-alternatives><mixed-citation xml:lang="ru">Karhunen K. Über lineare Methoden in der Wahrscheinlichkeitsrechnung. Ann. Acad. Sci. Fennicae. 1947;Ser. A137.</mixed-citation><mixed-citation xml:lang="en">Karhunen K. Über lineare Methoden in der Wahrscheinlichkeitsrechnung. Ann. Acad. Sci. Fennicae. 1947;Ser. A137.</mixed-citation></citation-alternatives></ref><ref id="cit12"><label>12</label><citation-alternatives><mixed-citation xml:lang="ru">Loève M. Fonctions Aléatoires de second ordre. In: Lévy P. (Ed.). Processus Stochastiques et Mouvement Brownien. Paris: Hermann, 1948.</mixed-citation><mixed-citation xml:lang="en">Loève M. Fonctions Aléatoires de second ordre. In: Lévy P. (Ed.). Processus Stochastiques et Mouvement Brownien. Paris: Hermann, 1948.</mixed-citation></citation-alternatives></ref><ref id="cit13"><label>13</label><citation-alternatives><mixed-citation xml:lang="ru">Polunin D.A., Shtaiger I.A., Efimov V.M. 
Development of software system JACOBI 4 for multivariate analysis of microarray data. Vestnik Novosibirskogo Gosudarstvennogo Universiteta. Seriya Informatsyonnye Tekhnologii = Vestnik NSU. Information Technology. 2014;12(2):90-98. (in Russian)</mixed-citation><mixed-citation xml:lang="en">Polunin D.A., Shtaiger I.A., Efimov V.M. Development of software system JACOBI 4 for multivariate analysis of microarray data. Vestnik Novosibirskogo Gosudarstvennogo Universiteta. Seriya Informatsyonnye Tekhnologii = Vestnik NSU. Information Technology. 2014;12(2):90-98. (in Russian)</mixed-citation></citation-alternatives></ref><ref id="cit14"><label>14</label><citation-alternatives><mixed-citation xml:lang="ru">Takens F. Detecting strange attractors in turbulence. In: Dynamical Systems and Turbulence. Warwick, 1980. Berlin; Heidelberg: Springer, 1981;366-381.</mixed-citation><mixed-citation xml:lang="en">Takens F. Detecting strange attractors in turbulence. In: Dynamical Systems and Turbulence. Warwick, 1980. Berlin; Heidelberg: Springer, 1981;366-381.</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that there are no conflicts of interest present.</p></fn></fn-group></back></article>
