{
"title": "K-Means Clustering Mastery: 100 MCQs",
"description": "A comprehensive set of 100 multiple-choice questions designed to test and deepen your understanding of K-Means Clustering, covering basic concepts, algorithm steps, practical scenarios, and challenges in high-dimensional or real-world datasets.",
"questions": [
{
"id": 1,
"questionText": "What is the primary goal of K-Means Clustering?",
"options": [
"Partition data into K clusters minimizing within-cluster variance",
"Reduce the dimensionality of the dataset",
"Detect outliers in the dataset",
"Classify data into predefined categories"
],
"correctAnswerIndex": 0,
"explanation": "K-Means aims to divide data into K clusters such that the sum of squared distances between points and their cluster centroid is minimized."
},
{
"id": 2,
"questionText": "In K-Means, what does a 'centroid' represent?",
"options": [
"A random point from the dataset",
"The farthest point from the cluster",
"The maximum value in the cluster",
"The mean position of all points in the cluster"
],
"correctAnswerIndex": 3,
"explanation": "Centroid is the mean of all points in a cluster and represents the cluster's center."
},
{
"id": 3,
"questionText": "Which step is repeated in K-Means until convergence?",
"options": [
"Compute correlation matrix",
"Assign points to nearest centroid and update centroids",
"Remove outliers",
"Randomly shuffle data points"
],
"correctAnswerIndex": 1,
"explanation": "K-Means iteratively assigns points to the nearest centroid and recalculates centroids until assignments stabilize."
},
{
"id": 4,
"questionText": "Scenario: K-Means converges but clusters are uneven in size. Likely reason?",
"options": [
"Centroids are incorrect",
"Distance metric used is Euclidean",
"Data distribution is skewed",
"Algorithm failed"
],
"correctAnswerIndex": 2,
"explanation": "K-Means partitions based on distances; skewed or non-spherical distributions can lead to uneven cluster sizes."
},
{
"id": 5,
"questionText": "What is the main limitation of K-Means clustering?",
"options": [
"Sensitive to outliers",
"Requires predefined number of clusters (K)",
"Only works for numerical data",
"All of the above"
],
"correctAnswerIndex": 3,
"explanation": "K-Means works only with numerical data, needs K as input, and is sensitive to outliers."
},
{
"id": 6,
"questionText": "Scenario: K-Means applied to customer locations. Distance metric to use?",
"options": [
"Euclidean distance",
"Hamming distance",
"Cosine similarity",
"Jaccard index"
],
"correctAnswerIndex": 0,
"explanation": "Euclidean distance is standard for K-Means and spatial numerical data."
},
{
"id": 7,
"questionText": "Scenario: K-Means on 2D points results vary with different initial centroids. Solution?",
"options": [
"Use hierarchical clustering instead",
"Ignore initial centroids",
"Reduce K",
"Use K-Means++ initialization"
],
"correctAnswerIndex": 3,
"explanation": "K-Means++ selects better initial centroids to improve convergence and consistency."
},
{
"id": 8,
"questionText": "Scenario: K-Means on concentric circles fails. Reason?",
"options": [
"K-Means assumes spherical clusters",
"Data contains outliers",
"Distance metric wrong",
"Algorithm converged too quickly"
],
"correctAnswerIndex": 0,
"explanation": "K-Means works best for convex, spherical clusters; it cannot separate concentric circular clusters."
},
{
"id": 9,
"questionText": "Scenario: After K-Means clustering, silhouette score is low. Interpretation?",
"options": [
"Noise ignored automatically",
"Clusters overlap or poorly defined",
"Clusters are perfect",
"Algorithm converged correctly"
],
"correctAnswerIndex": 1,
"explanation": "Low silhouette score indicates points are close to neighboring cluster centroids; clusters are not well separated."
},
{
"id": 10,
"questionText": "Scenario: Large dataset with millions of points. K-Means limitation?",
"options": [
"Algorithm fails completely",
"Distance metric is irrelevant",
"Cannot calculate centroids",
"Convergence can be slow; consider Mini-Batch K-Means"
],
"correctAnswerIndex": 3,
"explanation": "Mini-Batch K-Means is a faster variant suitable for large datasets."
},
{
"id": 11,
"questionText": "Scenario: K-Means on a dataset with outliers. Effect?",
"options": [
"Algorithm removes outliers",
"Centroids can shift towards outliers, distorting clusters",
"Clusters become more compact",
"Clusters ignore outliers automatically"
],
"correctAnswerIndex": 1,
"explanation": "Outliers can disproportionately affect centroids, leading to poorly defined clusters."
},
{
"id": 12,
"questionText": "Scenario: K-Means on categorical data. Limitation?",
"options": [
"K-Means requires numerical data; cannot handle categorical directly",
"Clusters merge randomly",
"Categorical data improves clustering",
"Algorithm automatically encodes categories"
],
"correctAnswerIndex": 0,
"explanation": "K-Means relies on distance metrics, which are not directly defined for categorical data."
},
{
"id": 13,
"questionText": "Scenario: Selecting K for K-Means. Which method helps?",
"options": [
"Merge dendrograms",
"Random selection",
"Elbow method",
"Silhouette ignored"
],
"correctAnswerIndex": 2,
"explanation": "The Elbow method plots sum of squared errors vs K and identifies an 'elbow' point as optimal K."
},
{
"id": 14,
"questionText": "Scenario: K-Means fails to separate overlapping clusters. Likely reason?",
"options": [
"K too small",
"Centroids are optimal",
"Algorithm converged correctly",
"Clusters are not well-separated or non-convex"
],
"correctAnswerIndex": 3,
"explanation": "K-Means works best for well-separated convex clusters; overlapping clusters are challenging."
},
{
"id": 15,
"questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
"options": [
"Algorithm fails automatically",
"Noise ignored",
"Distance metrics lose meaning; consider PCA or feature scaling",
"Clusters are always compact"
],
"correctAnswerIndex": 2,
"explanation": "High-dimensional spaces dilute distances; dimensionality reduction improves clustering."
},
{
"id": 16,
"questionText": "Scenario: K-Means with K too large. Effect?",
"options": [
"Algorithm automatically reduces K",
"Clusters always improve",
"Clusters may become small and meaningless",
"Noise ignored"
],
"correctAnswerIndex": 2,
"explanation": "Choosing K too large can lead to many tiny clusters with no meaningful pattern."
},
{
"id": 17,
"questionText": "Scenario: Mini-Batch K-Means. Advantage?",
"options": [
"Faster for large datasets with approximate centroids",
"Removes noise automatically",
"More accurate than standard K-Means",
"Works only on small datasets"
],
"correctAnswerIndex": 0,
"explanation": "Mini-Batch K-Means updates centroids using small random batches for efficiency on large datasets."
},
{
"id": 18,
"questionText": "Scenario: K-Means initialization affects results. Solution?",
"options": [
"Use single random centroid only",
"Run algorithm multiple times or use K-Means++",
"Ignore initialization",
"Reduce K randomly"
],
"correctAnswerIndex": 1,
"explanation": "K-Means++ and multiple runs improve stability and reduce sensitivity to initial centroids."
},
{
"id": 19,
"questionText": "Scenario: K-Means distance metric. Standard choice?",
"options": [
"Euclidean distance",
"Jaccard index",
"Hamming distance",
"Cosine similarity"
],
"correctAnswerIndex": 0,
"explanation": "K-Means typically uses Euclidean distance to assign points to nearest centroids."
},
{
"id": 20,
"questionText": "Scenario: K-Means clustering produces empty clusters. Cause?",
"options": [
"Algorithm failed",
"Clusters are compact",
"No points assigned to some centroids",
"Centroids are optimal"
],
"correctAnswerIndex": 2,
"explanation": "Some centroids may not attract any points, leading to empty clusters."
},
{
"id": 21,
"questionText": "Scenario: K-Means on text embeddings. Preprocessing required?",
"options": [
"Use categorical K-Means directly",
"No preprocessing needed",
"Randomly assign clusters",
"Normalize or scale vectors before clustering"
],
"correctAnswerIndex": 3,
"explanation": "Text embeddings often need normalization to prevent certain dimensions from dominating distance computations."
},
{
"id": 22,
"questionText": "Scenario: K-Means with very high K. Effect on SSE (sum of squared errors)?",
"options": [
"SSE increases",
"SSE is ignored",
"SSE decreases as K increases",
"SSE remains constant"
],
"correctAnswerIndex": 2,
"explanation": "As K increases, each cluster contains fewer points, reducing the sum of squared distances."
},
{
"id": 23,
"questionText": "Scenario: K-Means on scaled vs unscaled features. Effect?",
"options": [
"Clusters merge randomly",
"Scaling is important; features with large range dominate clustering",
"Algorithm fails if not scaled",
"Scaling is unnecessary"
],
"correctAnswerIndex": 1,
"explanation": "Features with larger numerical ranges can dominate Euclidean distance; scaling ensures fair contribution."
},
{
"id": 24,
"questionText": "Scenario: K-Means convergence criteria. Standard check?",
"options": [
"Centroid positions or cluster assignments stop changing",
"Distance metric ignored",
"Random stopping",
"Maximum iterations only"
],
"correctAnswerIndex": 0,
"explanation": "Algorithm stops when centroids or cluster assignments stabilize, or after a max number of iterations."
},
{
"id": 25,
"questionText": "Scenario: K-Means for image compression. How?",
"options": [
"Cluster pixel colors and replace each pixel by its centroid",
"Use hierarchical clustering",
"Remove noise automatically",
"Reduce image resolution"
],
"correctAnswerIndex": 0,
"explanation": "K-Means clusters similar colors, allowing image compression by using cluster centroids as representative colors."
},
{
"id": 26,
"questionText": "Scenario: K-Means for customer segmentation. Benefit?",
"options": [
"Identify customer groups for targeted marketing",
"Automatically predicts sales",
"Detects trends over time",
"Removes outliers"
],
"correctAnswerIndex": 0,
"explanation": "K-Means helps segment customers based on behavior, allowing targeted campaigns."
},
{
"id": 27,
"questionText": "Scenario: K-Means clustering results differ on repeated runs. Cause?",
"options": [
"Distance metric varies",
"Random initialization of centroids",
"Algorithm deterministic",
"Clusters merge randomly"
],
"correctAnswerIndex": 1,
"explanation": "Random initial centroids can lead to different final clusters; K-Means++ mitigates this."
},
{
"id": 28,
"questionText": "Scenario: K-Means for anomaly detection. Approach?",
"options": [
"Clusters merge randomly",
"Points far from nearest centroid may be anomalies",
"All points treated equally",
"Noise automatically ignored"
],
"correctAnswerIndex": 1,
"explanation": "Outliers are detected as points distant from cluster centroids."
},
{
"id": 29,
"questionText": "Scenario: K-Means clustering on geospatial data. Best practice?",
"options": [
"Use distance metric appropriate for coordinates (e.g., haversine)",
"Randomly assign clusters",
"Use Euclidean blindly",
"Clusters merge arbitrarily"
],
"correctAnswerIndex": 0,
"explanation": "Euclidean distance may misrepresent geographic distances; use geodesic metrics like haversine."
},
{
"id": 30,
"questionText": "Scenario: K-Means with highly correlated features. Solution?",
"options": [
"Increase K randomly",
"Apply PCA to reduce correlated dimensions",
"Ignore correlations",
"Clusters merge arbitrarily"
],
"correctAnswerIndex": 1,
"explanation": "PCA reduces correlated features and improves clustering performance."
},
{
"id": 31,
"questionText": "Scenario: K-Means on non-spherical clusters. Limitation?",
"options": [
"Noise ignored",
"Clusters are always compact",
"Algorithm automatically adapts",
"K-Means assumes spherical clusters; non-spherical clusters may be poorly separated"
],
"correctAnswerIndex": 3,
"explanation": "K-Means relies on Euclidean distance and assumes roughly spherical clusters, so elongated or irregular clusters are not well captured."
},
{
"id": 32,
"questionText": "Scenario: K-Means clustering produces clusters with very different densities. Challenge?",
"options": [
"Distance metric ignored",
"Clusters always equal",
"Low-density clusters may be merged incorrectly",
"Algorithm detects densities automatically"
],
"correctAnswerIndex": 2,
"explanation": "K-Means does not account for density; clusters with differing densities may not be separated properly."
},
{
"id": 33,
"questionText": "Scenario: K-Means applied to text embeddings. Best practice?",
"options": [
"Increase K arbitrarily",
"Normalize embeddings to unit vectors before clustering",
"Remove half the features randomly",
"Use raw embeddings"
],
"correctAnswerIndex": 1,
"explanation": "Normalization ensures that distance computation reflects angle similarity rather than magnitude differences."
},
{
"id": 34,
"questionText": "Scenario: K-Means clustering with missing values. Approach?",
"options": [
"Ignore missing values",
"Randomly assign missing values",
"Algorithm automatically handles them",
"Impute missing values before clustering"
],
"correctAnswerIndex": 3,
"explanation": "K-Means requires complete numerical data; missing values should be imputed or removed."
},
{
"id": 35,
"questionText": "Scenario: K-Means clustering on multi-dimensional customer features. Preprocessing step?",
"options": [
"Randomly drop features",
"Scale features so all dimensions contribute equally",
"Increase K arbitrarily",
"Leave features unscaled"
],
"correctAnswerIndex": 1,
"explanation": "Feature scaling ensures that dimensions with larger ranges do not dominate Euclidean distance."
},
{
"id": 36,
"questionText": "Scenario: K-Means on a dataset with outliers. Solution?",
"options": [
"Use standard K-Means without changes",
"Randomly assign clusters",
"Increase K to compensate",
"Remove or preprocess outliers before clustering"
],
"correctAnswerIndex": 3,
"explanation": "Outliers can distort centroids; preprocessing improves clustering accuracy."
},
{
"id": 37,
"questionText": "Scenario: K-Means convergence too slow. Solution?",
"options": [
"Ignore convergence",
"Change distance metric arbitrarily",
"Use Mini-Batch K-Means or reduce dataset size",
"Increase K randomly"
],
"correctAnswerIndex": 2,
"explanation": "Mini-Batch K-Means or subsampling speeds up convergence for large datasets."
},
{
"id": 38,
"questionText": "Scenario: K-Means clustering with highly correlated features. Best approach?",
"options": [
"Increase K",
"Ignore correlation",
"Merge clusters arbitrarily",
"Apply PCA or feature selection to reduce redundancy"
],
"correctAnswerIndex": 3,
"explanation": "Reducing correlated dimensions prevents redundant information from biasing distance calculations."
},
{
"id": 39,
"questionText": "Scenario: K-Means clustering on skewed data. Issue?",
"options": [
"Noise ignored",
"Algorithm corrects automatically",
"Clusters may be biased towards dense regions",
"Clusters always balanced"
],
"correctAnswerIndex": 2,
"explanation": "Skewed distributions can lead to unequal cluster sizes or poorly defined boundaries."
},
{
"id": 40,
"questionText": "Scenario: K-Means with K unknown. Methods to select K?",
"options": [
"Random choice",
"Algorithm decides automatically",
"Use maximum data points",
"Elbow method, silhouette score, gap statistic"
],
"correctAnswerIndex": 3,
"explanation": "These methods help determine optimal K by evaluating clustering performance."
},
{
"id": 41,
"questionText": "Scenario: K-Means produces very similar clusters on repeated runs. Possible reason?",
"options": [
"Algorithm converged incorrectly",
"Data naturally forms stable clusters",
"Distance metric is wrong",
"Initialization randomization failed"
],
"correctAnswerIndex": 1,
"explanation": "If data has well-separated clusters, K-Means results are stable across runs."
},
{
"id": 42,
"questionText": "Scenario: K-Means on a small dataset with large K. Risk?",
"options": [
"Algorithm fails completely",
"Centroids ignored",
"Clusters may be too small or empty",
"Clusters automatically merge"
],
"correctAnswerIndex": 2,
"explanation": "Too many clusters for small datasets can produce meaningless or empty clusters."
},
{
"id": 43,
"questionText": "Scenario: K-Means++ initialization. Benefit?",
"options": [
"Improves cluster quality by selecting distant initial centroids",
"Random initialization",
"Always produces identical clusters",
"Removes noise automatically"
],
"correctAnswerIndex": 0,
"explanation": "K-Means++ reduces poor initialization by spreading centroids apart."
},
{
"id": 44,
"questionText": "Scenario: K-Means with categorical features. Solution?",
"options": [
"Use K-Prototypes or encode categories numerically",
"Clusters merge randomly",
"Ignore categorical data",
"Use standard K-Means directly"
],
"correctAnswerIndex": 0,
"explanation": "Standard K-Means cannot handle categorical data; K-Prototypes or encoding is needed."
},
{
"id": 45,
"questionText": "Scenario: K-Means on noisy sensor data. Best practice?",
"options": [
"Use raw data",
"Increase K arbitrarily",
"Filter or preprocess noise before clustering",
"Ignore convergence"
],
"correctAnswerIndex": 2,
"explanation": "Noise affects centroids and cluster assignment; preprocessing improves results."
},
{
"id": 46,
"questionText": "Scenario: K-Means for image segmentation. Metric for colors?",
"options": [
"Cosine similarity",
"Euclidean distance in RGB or LAB space",
"Hamming distance",
"Jaccard index"
],
"correctAnswerIndex": 1,
"explanation": "Euclidean distance is standard for numerical pixel features in color space."
},
{
"id": 47,
"questionText": "Scenario: K-Means convergence to local minimum. Reason?",
"options": [
"Poor initialization of centroids",
"Algorithm always finds global minimum",
"Clusters are too compact",
"Distance metric is incorrect"
],
"correctAnswerIndex": 0,
"explanation": "Random initial centroids can lead K-Means to converge to suboptimal local minima."
},
{
"id": 48,
"questionText": "Scenario: K-Means clustering with overlapping clusters. Limitation?",
"options": [
"Clusters merge automatically",
"Algorithm adapts perfectly",
"Cannot clearly separate overlapping clusters",
"Noise ignored"
],
"correctAnswerIndex": 2,
"explanation": "K-Means relies on distance; overlapping clusters may not be correctly assigned."
},
{
"id": 49,
"questionText": "Scenario: K-Means for market segmentation. Use case?",
"options": [
"Remove outliers automatically",
"Identify customer groups for targeted campaigns",
"Predict stock prices",
"Visualize time series"
],
"correctAnswerIndex": 1,
"explanation": "K-Means clusters similar customers to enable targeted marketing strategies."
},
{
"id": 50,
"questionText": "Scenario: K-Means for anomaly detection in credit card transactions. Approach?",
"options": [
"Transactions far from cluster centroids may be fraudulent",
"All transactions treated equally",
"Clusters merge automatically",
"Noise ignored"
],
"correctAnswerIndex": 0,
"explanation": "Outliers distant from normal clusters can indicate anomalous or fraudulent activity."
},
{
"id": 51,
"questionText": "Scenario: K-Means on high-dimensional gene expression data. Best practice?",
"options": [
"Clusters merge randomly",
"Use raw high-dimensional data directly",
"Increase K arbitrarily",
"Use PCA or dimensionality reduction before clustering"
],
"correctAnswerIndex": 3,
"explanation": "Dimensionality reduction helps meaningful clustering and avoids distance dilution."
},
{
"id": 52,
"questionText": "Scenario: K-Means on very large dataset. Speed-up technique?",
"options": [
"Mini-Batch K-Means",
"Increase K",
"Ignore convergence",
"Use raw data"
],
"correctAnswerIndex": 0,
"explanation": "Mini-Batch K-Means updates centroids using batches, reducing computation time."
},
{
"id": 53,
"questionText": "Scenario: K-Means applied to IoT sensor data with missing values. Solution?",
"options": [
"Impute missing values before clustering",
"Remove entire dataset",
"Assign clusters randomly",
"Ignore missing values"
],
"correctAnswerIndex": 0,
"explanation": "K-Means requires complete numerical data; missing values must be handled prior to clustering."
},
{
"id": 54,
"questionText": "Scenario: K-Means applied to customer purchase history. Challenge?",
"options": [
"Clusters automatically balanced",
"Sparse purchase data may lead to poor cluster separation",
"Algorithm converges perfectly",
"Noise ignored"
],
"correctAnswerIndex": 1,
"explanation": "Sparse or high-dimensional data can reduce clustering accuracy; preprocessing helps."
},
{
"id": 55,
"questionText": "Scenario: K-Means with categorical features encoded as numbers. Risk?",
"options": [
"Algorithm works perfectly",
"Clusters merge automatically",
"Noise ignored",
"Numerical encoding may introduce artificial distance relationships"
],
"correctAnswerIndex": 3,
"explanation": "Direct numeric encoding of categorical data can misrepresent similarity between categories."
},
{
"id": 56,
"questionText": "Scenario: K-Means for spatial clustering of stores. Best practice?",
"options": [
"Increase K arbitrarily",
"Clusters merge randomly",
"Use raw coordinates directly",
"Normalize coordinates or use appropriate distance metric"
],
"correctAnswerIndex": 3,
"explanation": "Scaling ensures coordinates are comparable and distance computations are accurate."
},
{
"id": 57,
"questionText": "Scenario: K-Means produces poor clustering. Possible reason?",
"options": [
"Data not suitable for K-Means (non-spherical or overlapping)",
"Centroids incorrect",
"Algorithm always finds perfect clusters",
"Distance metric irrelevant"
],
"correctAnswerIndex": 0,
"explanation": "K-Means struggles with non-spherical or overlapping clusters."
},
{
"id": 58,
"questionText": "Scenario: K-Means clustering on scaled features. Advantage?",
"options": [
"Distance metric changes",
"Clusters merge automatically",
"Prevents dominance by features with large range",
"Algorithm ignores scaling"
],
"correctAnswerIndex": 2,
"explanation": "Scaling ensures each feature contributes equally to Euclidean distance calculations."
},
{
"id": 59,
"questionText": "Scenario: K-Means with clusters of unequal variance. Issue?",
"options": [
"Noise ignored",
"Algorithm automatically adjusts",
"Clusters always compact",
"Clusters may not accurately represent data structure"
],
"correctAnswerIndex": 3,
"explanation": "K-Means assumes similar variance; large differences affect cluster quality."
},
{
"id": 60,
"questionText": "Scenario: K-Means applied to time-series data. Approach?",
"options": [
"Use raw sequences directly",
"Increase K arbitrarily",
"Clusters merge randomly",
"Extract meaningful features before clustering"
],
"correctAnswerIndex": 3,
"explanation": "Feature extraction ensures distance metrics are meaningful for time-series clustering."
},
{
"id": 61,
"questionText": "Scenario: K-Means clusters overlap. Evaluation metric?",
"options": [
"Use SSE only",
"Clusters merge randomly",
"Ignore overlap",
"Silhouette score measures separation and cohesion"
],
"correctAnswerIndex": 3,
"explanation": "Silhouette score evaluates how well points fit within their clusters vs others."
},
{
"id": 62,
"questionText": "Scenario: K-Means with too few clusters. Result?",
"options": [
"Clusters may merge dissimilar points, reducing interpretability",
"Algorithm adapts automatically",
"Clusters always compact",
"Noise ignored"
],
"correctAnswerIndex": 0,
"explanation": "Too small K forces dissimilar points into same cluster, reducing accuracy."
},
{
"id": 63,
"questionText": "Scenario: K-Means for market basket analysis. Limitation?",
"options": [
"Algorithm works perfectly",
"Noise ignored",
"Clusters merge automatically",
"Sparse and categorical data requires encoding or alternate methods"
],
"correctAnswerIndex": 3,
"explanation": "Sparse categorical data needs careful preprocessing or K-Prototypes instead of K-Means."
},
{
"id": 64,
"questionText": "Scenario: K-Means produces empty clusters. Solution?",
"options": [
"Ignore empty clusters",
"Algorithm fails automatically",
"Increase K randomly",
"Reinitialize centroids or reduce K"
],
"correctAnswerIndex": 3,
"explanation": "Reassigning centroids or reducing K resolves empty clusters."
},
{
"id": 65,
"questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
"options": [
"Distance metrics lose meaning; reduce dimensions",
"Algorithm adapts automatically",
"Clusters always accurate",
"Noise ignored"
],
"correctAnswerIndex": 0,
"explanation": "High dimensions dilute distances, making clustering unreliable without dimensionality reduction."
},
{
"id": 66,
"questionText": "Scenario: K-Means on normalized vs unnormalized features. Effect?",
"options": [
"Algorithm automatically scales",
"Normalization ensures fair distance contribution across features",
"Unnormalized always better",
"Clusters merge randomly"
],
"correctAnswerIndex": 1,
"explanation": "Normalized features prevent features with large ranges from dominating clustering."
},
{
"id": 67,
"questionText": "Scenario: K-Means++ vs random initialization. Advantage?",
"options": [
"Improves clustering stability and convergence",
"Random initialization always better",
"No difference in results",
"Removes noise automatically"
],
"correctAnswerIndex": 0,
"explanation": "K-Means++ selects initial centroids to reduce poor local minima."
},
{
"id": 68,
"questionText": "Scenario: K-Means on customer purchase amounts. Data skewed. Solution?",
"options": [
"Use raw data",
"Log-transform or scale data before clustering",
"Increase K",
"Ignore skew"
],
"correctAnswerIndex": 1,
"explanation": "Transforming skewed data prevents high-value points from dominating clustering."
},
{
"id": 69,
"questionText": "Scenario: K-Means on text data after TF-IDF. Challenge?",
"options": [
"Algorithm works perfectly",
"High-dimensional sparse vectors; dimensionality reduction recommended",
"Noise ignored",
"Clusters merge randomly"
],
"correctAnswerIndex": 1,
"explanation": "Sparse high-dimensional TF-IDF vectors may reduce clustering effectiveness without reduction."
},
{
"id": 70,
"questionText": "Scenario: K-Means applied to IoT device readings. Best practice?",
"options": [
"Increase K randomly",
"Use raw readings",
"Ignore convergence",
"Normalize or scale features to ensure meaningful clustering"
],
"correctAnswerIndex": 3,
"explanation": "Scaling ensures that features contribute equally to distance calculations for clustering."
},
{
"id": 71,
"questionText": "Scenario: K-Means clustering applied to gene expression data with thousands of features. Best approach?",
"options": [
"Apply PCA or feature selection to reduce dimensionality before clustering",
"Randomly remove features",
"Increase K arbitrarily",
"Use all features directly"
],
"correctAnswerIndex": 0,
"explanation": "High-dimensional gene data can dilute distances; dimensionality reduction ensures meaningful clusters."
},
{
"id": 72,
"questionText": "Scenario: K-Means with very large K relative to dataset size. Risk?",
"options": [
"Clusters may be meaningless or empty",
"Algorithm automatically adjusts",
"Distance metric ignored",
"Clusters merge automatically"
],
"correctAnswerIndex": 0,
"explanation": "Too many clusters can lead to tiny or empty clusters with no interpretability."
},
{
"id": 73,
"questionText": "Scenario: K-Means on data with non-uniform density clusters. Limitation?",
"options": [
"Noise ignored",
"Low-density clusters may merge with high-density ones",
"Algorithm adjusts automatically",
"Clusters always compact"
],
"correctAnswerIndex": 1,
"explanation": "K-Means does not handle varying densities well; denser clusters dominate centroid assignment."
},
{
"id": 74,
"questionText": "Scenario: K-Means on highly skewed financial transaction data. Best preprocessing?",
"options": [
"Apply log transformation to reduce skew before clustering",
"Use raw data",
"Clusters merge randomly",
"Increase K arbitrarily"
],
"correctAnswerIndex": 0,
"explanation": "Log or other transformations reduce the effect of extreme values, improving clustering quality."
},
{
"id": 75,
"questionText": "Scenario: K-Means on time-series data. Effective method?",
"options": [
"Extract meaningful features such as trends or seasonal components before clustering",
"Use raw sequences directly",
"Clusters merge automatically",
"Increase K randomly"
],
"correctAnswerIndex": 0,
"explanation": "Feature extraction ensures distances reflect meaningful similarities in time-series."
},
{
"id": 76,
"questionText": "Scenario: K-Means clustering for anomaly detection in network traffic. Strategy?",
"options": [
"All points treated equally",
"Points far from cluster centroids are likely anomalies",
"Noise ignored",
"Clusters merge automatically"
],
"correctAnswerIndex": 1,
"explanation": "Outliers distant from normal traffic clusters are potential anomalies."
},
{
"id": 77,
"questionText": "Scenario: K-Means applied to image color compression. Challenge?",
"options": [
"Algorithm automatically selects K",
"All clusters identical",
"Noise ignored",
"Choosing optimal K to balance compression and image quality"
],
"correctAnswerIndex": 3,
"explanation": "Selecting K is critical; too few clusters lose color details, too many reduce compression."
},
{
"id": 78,
"questionText": "Scenario: K-Means++ vs multiple random initializations. Advantage of K-Means++?",
"options": [
"Reduces likelihood of poor local minima and improves convergence",
"Removes noise automatically",
"Random initializations are better",
"No difference in results"
],
"correctAnswerIndex": 0,
"explanation": "K-Means++ selects initial centroids that are distant, improving stability and cluster quality."
},
{
"id": 79,
"questionText": "Scenario: K-Means applied to sparse TF-IDF text vectors. Best approach?",
"options": [
"Use raw sparse vectors directly",
"Increase K arbitrarily",
"Reduce dimensionality using techniques like Truncated SVD before clustering",
"Clusters merge randomly"
],
"correctAnswerIndex": 2,
"explanation": "High-dimensional sparse data may produce poor clusters; dimensionality reduction improves performance."
},
{
"id": 80,
"questionText": "Scenario: K-Means clustering with overlapping spherical clusters. How to improve?",
"options": [
"K-Means always works",
"Clusters merge automatically",
"Reduce K randomly",
"Use Gaussian Mixture Models (GMM) for soft clustering"
],
"correctAnswerIndex": 3,
"explanation": "GMM can model cluster overlap using probability distributions, unlike hard K-Means assignments."
},
{
"id": 81,
"questionText": "Scenario: K-Means for customer segmentation with categorical attributes. Best practice?",
"options": [
"Use K-Prototypes or encode categories numerically",
"Ignore categorical data",
"Use standard K-Means directly",
"Clusters merge randomly"
],
"correctAnswerIndex": 0,
"explanation": "K-Prototypes handles mixed numerical and categorical data effectively."
},
{
"id": 82,
"questionText": "Scenario: K-Means convergence to local minimum. Cause?",
"options": [
"Distance metric incorrect",
"Clusters too compact",
"Poor or random initialization of centroids",
"Algorithm always finds global minimum"
],
"correctAnswerIndex": 2,
"explanation": "K-Means may converge to suboptimal solutions depending on initial centroids."
},
{
"id": 83,
"questionText": "Scenario: K-Means applied to geospatial clustering. Recommendation?",
"options": [
"Increase K randomly",
"Use appropriate distance metrics like haversine for coordinates",
"Clusters merge arbitrarily",
"Use Euclidean distance blindly"
],
"correctAnswerIndex": 1,
"explanation": "Geographic distances require correct metric to ensure accurate clustering."
},
{
"id": 84,
"questionText": "Scenario: K-Means with very large datasets. Efficient solution?",
"options": [
"Use Mini-Batch K-Means",
"Increase K arbitrarily",
"Ignore convergence",
"Use full dataset only"
],
"correctAnswerIndex": 0,
"explanation": "Mini-Batch K-Means speeds up computation by using small random batches for centroid updates."
},
{
"id": 85,
"questionText": "Scenario: K-Means on noisy IoT sensor data. Best preprocessing?",
"options": [
"Filter or smooth noise before clustering",
"Clusters merge automatically",
"Increase K arbitrarily",
"Use raw data"
],
"correctAnswerIndex": 0,
"explanation": "Noise can distort centroids; preprocessing improves clustering reliability."
},
{
"id": 86,
"questionText": "Scenario: K-Means on very high-dimensional data. Limitation?",
"options": [
"Distance metrics lose meaning; dimensionality reduction recommended",
"Noise ignored",
"Clusters always accurate",
"Algorithm adapts automatically"
],
"correctAnswerIndex": 0,
"explanation": "High-dimensional spaces dilute distances, leading to poor cluster assignments."
},
{
"id": 87,
"questionText": "Scenario: K-Means for anomaly detection in healthcare data. Approach?",
"options": [
"Noise ignored",
"Points far from cluster centroids may indicate anomalies",
"Clusters merge automatically",
"All points treated equally"
],
"correctAnswerIndex": 1,
"explanation": "Outliers distant from normal clusters can indicate anomalies or rare events."
},
{
"id": 88,
"questionText": "Scenario: K-Means on image segmentation with varying illumination. Challenge?",
"options": [
"Preprocessing like normalization is needed to reduce lighting effect",
"Algorithm works perfectly",
"Increase K randomly",
"Clusters merge automatically"
],
"correctAnswerIndex": 0,
"explanation": "Differences in lighting affect pixel values; normalization improves clustering consistency."
},
{
"id": 89,
"questionText": "Scenario: K-Means for market segmentation with mixed purchase behavior. Solution?",
"options": [
"Ignore categorical data",
"Clusters merge randomly",
"Use numerical encoding or K-Prototypes for categorical and numerical features",
"Use standard K-Means directly"
],
"correctAnswerIndex": 2,
"explanation": "Mixed data requires specialized clustering methods for meaningful segmentation."
},
{
"id": 90,
"questionText": "Scenario: K-Means clustering produces empty clusters repeatedly. Best solution?",
"options": [
"Algorithm fails automatically",
"Increase K arbitrarily",
"Ignore empty clusters",
"Reinitialize centroids or reduce K"
],
"correctAnswerIndex": 3,
"explanation": "Empty clusters occur when centroids have no assigned points; reinitialization or lowering K resolves this."
},
{
"id": 91,
"questionText": "Scenario: K-Means applied to highly imbalanced datasets. Issue?",
"options": [
"Large clusters may dominate, small clusters underrepresented",
"Clusters always balanced",
"Noise ignored",
"Algorithm adapts automatically"
],
"correctAnswerIndex": 0,
"explanation": "K-Means does not account for cluster size; imbalance may distort results."
},
{
"id": 92,
"questionText": "Scenario: K-Means applied to network traffic logs for intrusion detection. Best approach?",
"options": [
"Use raw logs directly",
"Increase K arbitrarily",
"Preprocess logs into numerical features and detect points far from centroids",
"Clusters merge automatically"
],
"correctAnswerIndex": 2,
"explanation": "Transforming logs to numerical vectors enables clustering and anomaly detection."
},
{
"id": 93,
"questionText": "Scenario: K-Means clustering with multiple valid K values. Evaluation metric?",
"options": [
"Silhouette score to evaluate cluster quality",
"Ignore K selection",
"Use SSE only",
"Clusters merge randomly"
],
"correctAnswerIndex": 0,
"explanation": "Silhouette score measures cohesion and separation, helping choose optimal K."
},
{
"id": 94,
"questionText": "Scenario: K-Means applied to text clustering using word embeddings. Limitation?",
"options": [
"High-dimensional vectors may require dimensionality reduction or normalization",
"Clusters merge randomly",
"Algorithm works perfectly",
"Noise ignored"
],
"correctAnswerIndex": 0,
"explanation": "Dimensionality reduction and normalization improve clustering accuracy for embeddings."
},
{
"id": 95,
"questionText": "Scenario: K-Means clustering results vary on repeated runs. Best solution?",
"options": [
"Use K-Means++ initialization or multiple runs",
"Clusters merge randomly",
"Ignore variations",
"Increase K arbitrarily"
],
"correctAnswerIndex": 0,
"explanation": "Better initialization reduces sensitivity to random centroid placement."
},
{
"id": 96,
"questionText": "Scenario: K-Means on scaled features vs unscaled features. Observation?",
"options": [
"Scaling ensures fair contribution of all features to distance calculation",
"Clusters merge randomly",
"Algorithm adapts automatically",
"Scaling is unnecessary"
],
"correctAnswerIndex": 0,
"explanation": "Without scaling, features with larger ranges dominate cluster assignments."
},
{
"id": 97,
"questionText": "Scenario: K-Means clustering on overlapping clusters. Alternative?",
"options": [
"Reduce K randomly",
"Clusters merge automatically",
"Use soft clustering like Gaussian Mixture Models",
"K-Means handles overlap perfectly"
],
"correctAnswerIndex": 2,
"explanation": "Soft clustering models allow points to belong probabilistically to multiple clusters."
},
{
"id": 98,
"questionText": "Scenario: K-Means applied to sensor network data with missing values. Solution?",
"options": [
"Impute missing values before clustering",
"Assign clusters randomly",
"Remove entire dataset",
"Ignore missing values"
],
"correctAnswerIndex": 0,
"explanation": "K-Means requires complete data; missing values must be handled prior to clustering."
},
{
"id": 99,
"questionText": "Scenario: K-Means on customer behavior data with high variance features. Best approach?",
"options": [
"Increase K arbitrarily",
"Clusters merge randomly",
"Use raw data",
"Scale or normalize features to prevent dominance by high-variance features"
],
"correctAnswerIndex": 3,
"explanation": "Scaling ensures fair contribution of each feature to distance computation."
},
{
"id": 100,
"questionText": "Scenario: K-Means applied to a large dataset with many outliers. Recommendation?",
"options": [
"Increase K arbitrarily",
"Clusters merge automatically",
"Use raw data directly",
"Preprocess to remove or handle outliers before clustering"
],
"correctAnswerIndex": 3,
"explanation": "Outliers distort centroids; preprocessing ensures meaningful cluster assignments."
}
]
}