% Journal article; the citation key is the article's DOI.
% Percent signs in the abstract are escaped so the field is safe to typeset.
@article{10.3844/jcssp.2011.216.224,
  author       = {Suguna, Nambiraj and Thanushkodi, Keppana Gowder},
  title        = {Predicting Missing Attribute Values Using k-Means Clustering},
  journal      = {Journal of Computer Science},
  publisher    = {Science Publications},
  volume       = {7},
  number       = {2},
  pages        = {216--224},
  year         = {2011},
  month        = feb,
  doi          = {10.3844/jcssp.2011.216.224},
  url          = {https://thescipub.com/abstract/jcssp.2011.216.224},
  article_type = {journal},
  abstract     = {Problem statement: Predicting the value for missing attributes is an important data
preprocessing problem in data mining and knowledge discovery tasks. Several methods have been
proposed to treat missing data and the one used more frequently is deleting instances containing at
least one missing value of a feature. When the dataset has minimum number of missing attribute values
then we can neglect the instances. But if it is high, deleting those instances may neglect the essential
information. Some methods, such as assigning an average value to the missing attribute, assigning the
most common values make good use of all the available data. However the assigned value may not
come from the information which the data originally derived from, thus noise is brought to the data.
Approach: In this study, k-means clustering is proposed for predicting missing attribute values. The
performance of the proposed approach is analyzed with nine different methods. The overall analysis
shows that the k-means clustering can predict the missing attribute values better than other methods.
After assigning the missing attributes, the feature selection is performed with Bees Colony
Optimization (BCO) and the improved Genetic KNN is applied for finding the classification
performance as discussed in our previous study. Results: The performance is analyzed with four
different medical datasets; Dermatology, Cleveland Heart, Lung Cancer and Wisconsin. For all the
datasets, the proposed k-means based missing attribute prediction achieves higher accuracy of 94.60\%,
90.45\%, 87.51\% and 95.70\% respectively. Conclusion: The greater classification accuracy shows
the superior performance of the k-means based missing attribute value prediction.},
}