% Bibliography: publications on heart-disease / cardiovascular classification.
% The url fields point at Scopus and ScienceDirect records, so the entries
% appear to be database exports. Citation keys are kept exactly as they were,
% so existing citations keep resolving.
%
% Conventions used below:
%   - one field per line, values delimited by braces;
%   - author names in "Last, First" form, initials separated by a space;
%   - page ranges written with a double hyphen;
%   - doi holds the bare identifier, without a resolver prefix.

% Journal article: comparison of filter feature-selection techniques.
@article{Benhar2022791,
  title    = {Univariate and Multivariate Filter Feature Selection for Heart Disease Classification},
  journal  = {Journal of Information Science and Engineering},
  volume   = {38},
  number   = {4},
  year     = {2022},
  note     = {cited By 0},
  pages    = {791--803},
  abstract = {Feature selection (FS) is a data preprocessing task that can be applied before the classification phase, and aims at improving the performance and interpretability of classifiers by finding only a few highly informative features. The present study aims at evaluating and comparing the performances of six univariate and two multivariate filter FS techniques for heart disease classification. The FS techniques were evaluated with two white-box and two black-box classification techniques using five heart disease datasets. Furthermore, this study deals with the setting of the hyperparameters{\textquoteright} values of the four classifiers. This study evaluates 600 variants of classifiers. Results show that white-box classification techniques such as K-Nearest Neighbors and Decision Trees can be very competitive with black-box ones when hyperparameters{\textquoteright} optimization and feature selection were applied. {\textcopyright} 2022 Institute of Information Science. All rights reserved.},
  keywords = {Black boxes, Cardiology, Classification (of information), Data preprocessing, Decision trees, disease classification, Diseases, feature selection, Features selection, Filter, Heart, heart disease, Nearest neighbor search, Performance, Selection techniques, Univariate, White box},
  doi      = {10.6688/JISE.202207_38(4).0006},
  url      = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85137170374\&doi=10.6688\%2fJISE.202207_38\%284\%29.0006\&partnerID=40\&md5=f75bca8ff78e6782d4c11e88d338784f},
  author   = {Benhar, H. and Hosni, M. and Idri, A.}
}

% Journal article: systematic mapping study of ensemble classifiers.
% The compound surname "Carrillo de Gea" is braced; without the braces BibTeX
% treats the lowercase "de" as a von-particle and takes only "Gea" as the
% last name.
@article{Hosni20212827,
  title    = {A systematic mapping study for ensemble classification methods in cardiovascular disease},
  journal  = {Artificial Intelligence Review},
  volume   = {54},
  number   = {4},
  year     = {2021},
  note     = {cited By 5},
  pages    = {2827--2861},
  abstract = {Ensemble methods overcome the limitations of single machine learning techniques by combining different techniques, and are employed in the quest to achieve a high level of accuracy. This approach has been investigated in various fields, one of them being that of bioinformatics. One of the most frequent applications of ensemble techniques involves research into cardiovascular diseases, which are considered the leading cause of death worldwide. The purpose of this research work is to identify the papers that investigate ensemble classification techniques applied to cardiology diseases, and to analyse them according to nine aspects: their publication venues, the medical tasks tackled, the empirical and research types adopted, the types of ensembles proposed, the single techniques used to construct the ensembles, the validation frameworks adopted to evaluate the proposed ensembles, the tools used to build the ensembles, and the optimization methods employed for the single techniques. This paper reports the carrying out of a systematic mapping study. An extensive automatic search in four digital libraries: IEEE Xplore, ACM Digital Library, PubMed, and Scopus, followed by a study selection process, resulted in the identification of 351 papers that were used to address our mapping questions. This study found that the papers selected had been published in a large number of different resources. The medical task addressed most frequently by the selected studies was diagnosis. In addition, the experiment-based empirical type and evaluation-based research type were the most dominant approaches adopted by the selected studies. Homogeneous ensembles were the ensemble type that was developed most often in literature, while decision trees, artificial neural networks and Bayesian classifiers were the single techniques used most frequently to develop ensemble classification methods. The weighted majority and majority voting rules were adopted to obtain the final decision of the ensembles developed. With regard to evaluation frameworks, the datasets obtained from the UCI and PhysioBank repositories were those used most often to evaluate the ensemble methods, while the k-fold cross-validation method was the most frequently-employed validation technique. Several tools with which to build ensemble classifiers were identified, and the type of software adopted with the greatest frequency was open source. Finally, only a few researchers took into account the optimization of the parameter settings of either single or meta ensemble classifiers. This mapping study attempts to provide a greater insight into the application of ensemble classification methods in cardiovascular diseases. The majority of the selected papers reported positive feedback as regards the ability of ensemble methods to perform better than single methods. Further analysis is required to aggregate the evidence reported in literature. {\textcopyright} 2020, Springer Nature B.V.},
  keywords = {Bayesian networks, Cardio-vascular disease, Cardiology, Decision trees, Diagnosis, Digital libraries, Diseases, Ensemble classification, Ensemble classifiers, Evaluation framework, K fold cross validations, Learning systems, Majority voting rules, Mapping, Open source software, Open systems, Optimization method, Systematic mapping studies},
  doi      = {10.1007/s10462-020-09914-6},
  url      = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85091735819\&doi=10.1007\%2fs10462-020-09914-6\&partnerID=40\&md5=69ea4b02de420c3ec6a85e1f3c7dddaf},
  author   = {Hosni, M. and {Carrillo de Gea}, J. M. and Idri, A. and El Bajta, M. and Fern{\'a}ndez Alem{\'a}n, J. L. and Garc{\'\i}a-Mateos, G. and Abnane, I.}
}

% NOTE(review): the next two entries (keys BENHAR2020105635 and Benhar2020)
% describe the same article; they carry the same DOI. Both keys are kept so
% that citations of either one still resolve. Cite only one of them in any
% given document, otherwise the reference is listed twice.

% Journal article: systematic literature review (record with the
% sciencedirect.com url). The doi field holds the bare DOI.
@article{BENHAR2020105635,
  title    = {Data preprocessing for heart disease classification: A systematic literature review},
  journal  = {Computer Methods and Programs in Biomedicine},
  volume   = {195},
  year     = {2020},
  pages    = {105635},
  abstract = {Context Early detection of heart disease is an important challenge since 17.3 million people yearly lose their lives due to heart diseases. Besides, any error in diagnosis of cardiac disease can be dangerous and risks an individual{\textquoteright}s life. Accurate diagnosis is therefore critical in cardiology. Data Mining (DM) classification techniques have been used to diagnosis heart diseases but still limited by some challenges of data quality such as inconsistencies, noise, missing data, outliers, high dimensionality and imbalanced data. Data preprocessing (DP) techniques were therefore used to prepare data with the goal of improving the performance of heart disease DM based prediction systems. Objective The purpose of this study is to review and summarize the current evidence on the use of preprocessing techniques in heart disease classification as regards: (1) the DP tasks and techniques most frequently used, (2) the impact of DP tasks and techniques on the performance of classification in cardiology, (3) the overall performance of classifiers when using DP techniques, and (4) comparisons of different combinations classifier-preprocessing in terms of accuracy rate. Method A systematic literature review is carried out, by identifying and analyzing empirical studies on the application of data preprocessing in heart disease classification published in the period between January 2000 and June 2019. A total of 49 studies were therefore selected and analyzed according to the aforementioned criteria. Results The review results show that data reduction is the most used preprocessing task in cardiology, followed by data cleaning. In general, preprocessing either maintained or improved the performance of heart disease classifiers. Some combinations such as (ANN~+~PCA), (ANN~+~CHI) and (SVM~+~PCA) are promising terms of accuracy. However the deployment of these models in real-world diagnosis decision support systems is subject to several risks and limitations due to the lack of interpretation.},
  keywords = {Cardiac datasets, Cardiology, Data preprocessing, Datamining, Literature review},
  issn     = {0169-2607},
  doi      = {10.1016/j.cmpb.2020.105635},
  url      = {https://www.sciencedirect.com/science/article/pii/S0169260720314681},
  author   = {Benhar, H. and Idri, A. and Fern{\'a}ndez-Alem{\'a}n, J. L.}
}

% Journal article: same work as BENHAR2020105635 (record with the scopus.com
% url). The third author is written as surname "Fernandez-Aleman" with
% initials "J. L.", matching the author list of the other record.
@article{Benhar2020,
  title    = {Data preprocessing for heart disease classification: A systematic literature review},
  journal  = {Computer Methods and Programs in Biomedicine},
  volume   = {195},
  year     = {2020},
  note     = {cited By 25},
  abstract = {Context: Early detection of heart disease is an important challenge since 17.3 million people yearly lose their lives due to heart diseases. Besides, any error in diagnosis of cardiac disease can be dangerous and risks an individual{\textquoteright}s life. Accurate diagnosis is therefore critical in cardiology. Data Mining (DM) classification techniques have been used to diagnosis heart diseases but still limited by some challenges of data quality such as inconsistencies, noise, missing data, outliers, high dimensionality and imbalanced data. Data preprocessing (DP) techniques were therefore used to prepare data with the goal of improving the performance of heart disease DM based prediction systems. Objective: The purpose of this study is to review and summarize the current evidence on the use of preprocessing techniques in heart disease classification as regards: (1) the DP tasks and techniques most frequently used, (2) the impact of DP tasks and techniques on the performance of classification in cardiology, (3) the overall performance of classifiers when using DP techniques, and (4) comparisons of different combinations classifier-preprocessing in terms of accuracy rate. Method: A systematic literature review is carried out, by identifying and analyzing empirical studies on the application of data preprocessing in heart disease classification published in the period between January 2000 and June 2019. A total of 49 studies were therefore selected and analyzed according to the aforementioned criteria. Results: The review results show that data reduction is the most used preprocessing task in cardiology, followed by data cleaning. In general, preprocessing either maintained or improved the performance of heart disease classifiers. Some combinations such as (ANN + PCA), (ANN + CHI) and (SVM + PCA) are promising terms of accuracy. However the deployment of these models in real-world diagnosis decision support systems is subject to several risks and limitations due to the lack of interpretation. {\textcopyright} 2020 Elsevier B.V.},
  keywords = {Cardiology, Classification (of information), Classification technique, classifier, clinical practice, clinical research, Computer aided diagnosis, data classification, Data mining, Data preprocessing, data processing, Decision support systems, Deep learning, Diagnosis decision, diagnostic accuracy, disease classification, Diseases, empiricism, evidence based practice, feature selection, Heart, heart disease, Heart Diseases, High dimensionality, human, Humans, intermethod comparison, Machine learning, Performance of classifier, prediction, Prediction systems, Preprocessing techniques, publication, Review, Support vector machines, Systematic literature review, Systematic Review, task performance},
  doi      = {10.1016/j.cmpb.2020.105635},
  url      = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85087500300\&doi=10.1016\%2fj.cmpb.2020.105635\&partnerID=40\&md5=cae53ce36903d5d8b817ec96deb39b45},
  author   = {Benhar, H. and Idri, A. and Fern{\'a}ndez-Alem{\'a}n, J. L.}
}

% Conference paper (HEALTHINF 2020). Uses the inproceedings entry type, of
% which "conference" is only an alias. No DOI is recorded for this entry.
@inproceedings{Benhar2020391,
  title     = {Impact of threshold values for filter-based univariate feature selection in heart disease classification},
  booktitle = {HEALTHINF 2020 - 13th International Conference on Health Informatics, Proceedings; Part of 13th International Joint Conference on Biomedical Engineering Systems and Technologies, BIOSTEC 2020},
  year      = {2020},
  note      = {cited By 2},
  pages     = {391--398},
  abstract  = {In the last decade, feature selection (FS), was one of the most investigated preprocessing tasks for heart disease prediction. Determining the optimal features which contribute more towards the diagnosis of heart disease can reduce the number of clinical tests needed to be taken by a patient, decrease the model cost, reduce the storage requirements and improve the comprehensibility of the induced model. In this study a comparison of three filter feature ranking methods was carried out. Feature ranking methods need to set a threshold (i.e. the percentage of the number of relevant features to be selected) in order to select the final subset of features. Thus, the aim of this study is to investigate if there is a threshold value which is an optimal choice for three different feature ranking methods and four classifiers used for heart disease classification in four heart disease datasets. The used feature ranking methods and selection thresholds resulted in optimal classification performance for one or more classifiers over small and large heart disease datasets. The size of the dataset takes an important role in the choice of the selection threshold. {\textcopyright} 2020 by SCITEPRESS - Science and Technology Publications, Lda. All rights reserved.},
  keywords  = {Biomedical engineering, Cardiology, Classification (of information), Clinical tests, Diagnosis, Diseases, Feature extraction, Feature ranking, Heart, heart disease, Large dataset, Medical informatics, Optimal choice, Optimal classification, Relevant features, Storage requirements, Threshold-value},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85083712586\&partnerID=40\&md5=4656d8b952f7c60387d4495c737c5a6d},
  author    = {Benhar, H. and Idri, A. and Hosni, M.}
}

% Paper published in a Lecture Notes volume. The acronym MARCO is braced in
% the title so that sentence-casing styles do not lowercase it.
% NOTE(review): the journal field holds a book-series name and the volume
% field carries the series suffix "LNBI"; this looks like a proceedings
% chapter. Consider converting it to an inproceedings entry with booktitle
% and series once the proceedings title has been confirmed.
@article{Sanak2020203,
  title    = {{MARCO} Gene Variations and Their Association with Cardiovascular Diseases Development: An In-Silico Analysis},
  journal  = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  volume   = {12108 LNBI},
  year     = {2020},
  note     = {cited By 0},
  pages    = {203--212},
  abstract = {Cardiovascular diseases (CVDs) represent the leading cause of morbidity and mortality in both developed and developing countries. They have complex etiology, influenced by several risk factors including the genetic component. The genetic variations were shown to be highly associated with different CVD forms, in this objective we proceeded to analyze the Macrophage Receptor with Collagen structure gene (MARCO), we performed an in-silico study with a genomic functional analysis, to evaluate the mutations{\textquoteright} effects on the proteins{\textquoteright} structures and functionalities. Indeed, we used dbSNP to retrieve single nucleotide polymorphisms (SNPs) of MARCO gene. We proceeded then to a filtration and a stability analysis using several bioinformatics tools to evaluate the most deleterious variations. Moreover we predicted the 3D structures of the encoded proteins by MARCO gene, which was validated using PROCHECK. Then we analyzed and visualize the proteins{\textquoteright} 3D structures. The extraction of the human MARCO gene SNPs revealed that dbSNP contains more than 14000 SNPs. The filtration process revealed the variations G241V and G262W to be the most deleterious SNPs, indeed, I-Mutant and DUET showed decreased protein stability. The validation using PROCHECK revealed a total of 89.9\% MARCO protein residues to be in the favored region. As conclusion, our results let suggesting that G241V and G262W variations can cause alteration in the proteins{\textquoteright} structures and functions. Hence, to improve the health management, screening precariously these variants, can be useful as model for CVD diagnosis and helpful in pharmacogenomics. {\textcopyright} Springer Nature Switzerland AG 2020.},
  keywords = {Bioinformatics, Bioinformatics tools, Biomedical engineering, Cardio-vascular disease, Cardiology, Collagen structure, Developing countries, Diagnosis, Diseases, Filtration process, Genes, Genetic components, Genetic variation, Health management, Proteins, Single nucleotide polymorphisms},
  doi      = {10.1007/978-3-030-45385-5_19},
  url      = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85085196738\&doi=10.1007\%2f978-3-030-45385-5_19\&partnerID=40\&md5=1df531cdc3510e8747834a13a996b1b1},
  author   = {Sanak, K. and Azzouzi, M. and Abik, M. and Radouani, F.}
}