% Conference paper from IEEE CCECE 2019 (entry originates from an automated export).
% Notes for maintainers:
%  - Author names are stored in the unambiguous "Last, First" form.
%  - booktitle alone carries the proceedings name; no series field is set,
%    because the proceedings title is not a separate book series.
%  - The copyright line and the conference dates live in their own fields
%    (copyright, eventdate) so they do not print as a free-text note.
%  - No address is given: that field is for the publisher's city, and only
%    a country was available in the export.
@inproceedings{d0752960437e48f3ae8043147cc7de48,
  author    = {Alhussein, Omar and Yoo, Paul D. and Muhaidat, Sami and Liang, Jie},
  title     = {Semiparametric Subsampling and Data Condensation for Large-Scale Data Analytics},
  booktitle = {2019 {IEEE} Canadian Conference of Electrical and Computer Engineering, {CCECE} 2019},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2019},
  month     = may,
  eventdate = {2019-05-05/2019-05-08},
  doi       = {10.1109/CCECE.2019.8861966},
  language  = {british},
  keywords  = {clustering, Data Condensation, machine learning, subsampling},
  copyright = {{\textcopyright} 2019 IEEE},
  abstract  = {Subsampling is often used to reduce the complexity of large datasets. However, such methods need to ensure that the subsampled data are representative of the original dataset. Here, we introduce a new clustering-based data condensation (subsampling) framework for large datasets. The framework relies on the use of stratified sampling, Voronoi diagrams, and variational Bayes-based Gaussian mixture clustering. We tested the proposed framework on three large imbalanced benchmark datasets, namely cod-RNA, ds1.10, and ds1.100. The efficiency and generality of the proposed framework were assessed by comparing the predictive performance of the reduced datasets with the original datasets over two machine-learning classifiers, namely the random forest, and the radial basis function network. The evaluation metrics included the accuracy, F-measure and reduction percentage. We found that very high reduction percentages can be achieved using our new framework while maintaining satisfactory predictive performance.},
}