% Data series similarity search publications (Echihabi et al.).
% Conventions used below: one field per line, author first, page ranges
% written with "--", and export-generated citation counts stored in the
% ignored "internal-note" field so they never print in a bibliography.

% Journal article: Proceedings of the VLDB Endowment 15(10), 2022.
@article{Echihabi20222005,
  author        = {Echihabi, K. and Fatourou, P. and Zoumpatianos, K. and Palpanas, T. and Benbrahim, H.},
  title         = {Hercules Against Data Series Similarity Search},
  journal       = {Proceedings of the VLDB Endowment},
  volume        = {15},
  number        = {10},
  pages         = {2005--2018},
  year          = {2022},
  doi           = {10.14778/3547305.3547308},
  url           = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85132784935\&doi=10.14778\%2f3547305.3547308\&partnerID=40\&md5=539332f4b18822ebf450dd3c3f5fda5d},
  abstract      = {We propose Hercules, a parallel tree-based technique for exact similarity search on massive disk-based data series collections. We present novel index construction and query answering algorithms that leverage different summarization techniques, carefully schedule costly operations, optimize memory and disk accesses, and exploit the multi-threading and SIMD capabilities of modern hardware to perform CPU-intensive calculations. We demonstrate the superiority and robustness of Hercules with an extensive experimental evaluation against state-of-the-art techniques, using many synthetic and real datasets, and query workloads of varying difficulty. The results show that Hercules performs up to one order of magnitude faster than the best competitor (which is not always the same). Moreover, Hercules is the only index that outperforms the optimized scan on all scenarios, including the hard query workloads on disk-based datasets. {\textcopyright} 2022, VLDB Endowment., All rights reserved.},
  keywords      = {CPU-intensive, Data series, Disk-based, Exact similarity searches, Experimental evaluation, Index construction, Multi-threading, Query answering, Similarity search, Tree-based, Trees (mathematics)},
  internal-note = {cited By 3},
}

% Journal article: Proceedings of the VLDB Endowment 13(3), 2020.
% "Lernaean" and "Hydra" are brace-protected so sentence-casing styles
% keep their capitals.
@article{Echihabi2020402,
  author        = {Echihabi, K. and Zoumpatianos, K. and Palpanas, T. and Benbrahim, H.},
  title         = {Return of the {Lernaean} {Hydra}: Experimental Evaluation of Data Series Approximate Similarity Search},
  journal       = {Proceedings of the VLDB Endowment},
  volume        = {13},
  number        = {3},
  pages         = {402--419},
  year          = {2020},
  doi           = {10.14778/3368289.3368303},
  url           = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85092075628\&doi=10.14778\%2f3368289.3368303\&partnerID=40\&md5=c84591d6c7c3001ccc91a3584e71e4ff},
  abstract      = {Data series are a special type of multidimensional data present in numerous domains, where similarity search is a key operation that has been extensively studied in the data series literature. In parallel, the multidimensional community has studied approximate similarity search techniques. We propose a taxonomy of similarity search techniques that reconciles the terminology used in these two domains, we describe modifications to data series indexing techniques enabling them to answer approximate similarity queries with quality guarantees, and we conduct a thorough experimental evaluation to compare approximate similarity search techniques under a unified framework, on synthetic and real datasets in memory and on disk. Although data series differ from generic multidimensional vectors (series usually exhibit correlation between neighboring values), our results show that data series techniques answer approximate queries with strong guarantees and an excellent empirical performance, on data series and vectors alike. These techniques outperform the state-of-the-art approximate techniques for vectors when operating on disk, and remain competitive in memory.},
  keywords      = {Approximate query, Empirical performance, Experimental evaluation, Indexing techniques, Multi-dimensional vectors, Multidimensional data, Quality control, Query processing, Similarity search, Unified framework},
  internal-note = {cited By 25},
}

% Conference paper: ACM International Conference Proceeding Series, 2020.
% Uses the preferred "inproceedings" type rather than its legacy alias.
@inproceedings{Echihabi20201,
  author        = {Echihabi, K. and Zoumpatianos, K. and Palpanas, T.},
  title         = {Scalable Machine Learning on High-Dimensional Vectors: From Data Series to Deep Network Embeddings},
  booktitle     = {ACM International Conference Proceeding Series},
  volume        = {Part F162565},
  pages         = {1--6},
  year          = {2020},
  doi           = {10.1145/3405962.3405989},
  url           = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85091520891\&doi=10.1145\%2f3405962.3405989\&partnerID=40\&md5=b3b9f8c365e940d7a488d9a9a20df419},
  abstract      = {There is an increasingly pressing need, by several applications in diverse domains, for developing techniques able to analyze very large collections of static and streaming sequences (a.k.a. data series), predominantly in real-time. Examples of such applications come from Internet of Things installations, neuroscience, astrophysics, and a multitude of other scientific and application domains that need to apply machine learning techniques for knowledge extraction. It is not unusual for these applications, for which similarity search is a core operation, to involve numbers of data series in the order of hundreds of millions to billions, which are seldom analyzed in their full detail due to their sheer size. Such application requirements have driven the development of novel similarity search methods that can facilitate scalable analytics in this context. At the same time, a host of other methods have been developed for similarity search of high-dimensional vectors in general. All these methods are now becoming increasingly important, because of the growing popularity and size of sequence collections, as well as the growing use of high-dimensional vector representations of a large variety of objects (such as text, multimedia, images, audio and video recordings, graphs, database tables, and others) thanks to deep network embeddings. In this work, we review recent efforts in designing techniques for indexing and analyzing massive collections of data series, and argue that they are the methods of choice even for general high-dimensional vectors. Finally, we discuss the challenges and open research problems in this area. {\textcopyright} 2020 Owner/Author.},
  keywords      = {Application requirements, Astrophysics, Deep learning, Designing techniques, Embeddings, High-dimensional, Intelligent systems, Knowledge extraction, Learning systems, Machine learning techniques, Real time systems, Research problems, Scalable machine learning, Semantics, Similarity search, Vectors, Video recording},
  internal-note = {cited By 0},
}

% Journal article: Proceedings of the VLDB Endowment 12(2), 2018.
% Typed as an article with a "journal" field, matching the other entries
% from this venue; a proceedings-type entry carrying both volume and number
% triggers warnings in standard styles.
@article{Echihabi2018112,
  author  = {Echihabi, K. and Zoumpatianos, K. and Palpanas, T. and Benbrahim, H.},
  title   = {The {Lernaean} {Hydra} of Data Series Similarity Search: An Experimental Evaluation of the State of the Art},
  journal = {Proceedings of the VLDB Endowment},
  volume  = {12},
  number  = {2},
  pages   = {112--127},
  year    = {2018},
  doi     = {10.14778/3282495.3282498},
  url     = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85061729182\&doi=10.14778\%2f3282495.3282498\&partnerID=40\&md5=1b5df3c860cb0457dac9c0facb8d6f52},
}