{
  "title": "K-Means Clustering Mastery: 100 MCQs",
  "description": "A comprehensive set of 100 multiple-choice questions designed to test and deepen your understanding of K-Means Clustering, covering basic concepts, algorithm steps, practical scenarios, and challenges in high-dimensional or real-world datasets.",
  "questions": [
    {
      "id": 1,
      "questionText": "What is the primary goal of K-Means Clustering?",
      "options": [
        "Partition data into K clusters minimizing within-cluster variance",
        "Reduce the dimensionality of the dataset",
        "Detect outliers in the dataset",
        "Classify data into predefined categories"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means aims to divide data into K clusters such that the sum of squared distances between points and their cluster centroid is minimized."
    },
    {
      "id": 2,
      "questionText": "In K-Means, what does a 'centroid' represent?",
      "options": [
        "A random point from the dataset",
        "The farthest point from the cluster",
        "The maximum value in the cluster",
        "The mean position of all points in the cluster"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The centroid is the mean of all points in a cluster and represents the cluster's center."
    },
    {
      "id": 3,
      "questionText": "Which step is repeated in K-Means until convergence?",
      "options": [
        "Compute correlation matrix",
        "Assign points to nearest centroid and update centroids",
        "Remove outliers",
        "Randomly shuffle data points"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means iteratively assigns points to the nearest centroid and recalculates centroids until assignments stabilize."
    },
    {
      "id": 4,
      "questionText": "Scenario: K-Means converges but clusters are uneven in size. Likely reason?",
      "options": [
        "Centroids are incorrect",
        "Distance metric used is Euclidean",
        "Data distribution is skewed",
        "Algorithm failed"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means partitions based on distances; skewed or non-spherical distributions can lead to uneven cluster sizes."
    },
    {
      "id": 5,
      "questionText": "Which of the following are limitations of K-Means clustering?",
      "options": [
        "Sensitive to outliers",
        "Requires predefined number of clusters (K)",
        "Only works for numerical data",
        "All of the above"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means requires K as input, works only with numerical data, and is sensitive to outliers."
    },
    {
      "id": 6,
      "questionText": "Scenario: K-Means applied to customer locations. Distance metric to use?",
      "options": [
        "Euclidean distance",
        "Hamming distance",
        "Cosine similarity",
        "Jaccard index"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Euclidean distance is standard for K-Means and spatial numerical data."
    },
    {
      "id": 7,
      "questionText": "Scenario: K-Means on 2D points gives varying results with different initial centroids. Solution?",
      "options": [
        "Use hierarchical clustering instead",
        "Ignore initial centroids",
        "Reduce K",
        "Use K-Means++ initialization"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means++ selects better initial centroids to improve convergence and consistency."
    },
    {
      "id": 8,
      "questionText": "Scenario: K-Means on concentric circles fails. Reason?",
      "options": [
        "K-Means assumes spherical clusters",
        "Data contains outliers",
        "Distance metric wrong",
        "Algorithm converged too quickly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means works best for convex, spherical clusters; it cannot separate concentric circular clusters."
    },
    {
      "id": 9,
      "questionText": "Scenario: After K-Means clustering, silhouette score is low. Interpretation?",
      "options": [
        "Noise ignored automatically",
        "Clusters overlap or are poorly defined",
        "Clusters are perfect",
        "Algorithm converged correctly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "A low silhouette score indicates that points lie nearly as close to neighboring clusters as to their own, so clusters are not well separated."
    },
    {
      "id": 10,
      "questionText": "Scenario: Large dataset with millions of points. K-Means limitation?",
      "options": [
        "Algorithm fails completely",
        "Distance metric is irrelevant",
        "Cannot calculate centroids",
        "Convergence can be slow; consider Mini-Batch K-Means"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Mini-Batch K-Means is a faster variant suitable for large datasets."
    },
    {
      "id": 11,
      "questionText": "Scenario: K-Means on a dataset with outliers. Effect?",
      "options": [
        "Algorithm removes outliers",
        "Centroids can shift towards outliers, distorting clusters",
        "Clusters become more compact",
        "Clusters ignore outliers automatically"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers can disproportionately affect centroids, leading to poorly defined clusters."
    },
    {
      "id": 12,
      "questionText": "Scenario: K-Means on categorical data. Limitation?",
      "options": [
        "K-Means requires numerical data; cannot handle categorical directly",
        "Clusters merge randomly",
        "Categorical data improves clustering",
        "Algorithm automatically encodes categories"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means relies on distance metrics, which are not directly defined for categorical data."
    },
    {
      "id": 13,
      "questionText": "Scenario: Selecting K for K-Means. Which method helps?",
      "options": [
        "Merge dendrograms",
        "Random selection",
        "Elbow method",
        "Silhouette ignored"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The Elbow method plots the sum of squared errors against K; the point where the curve bends (the 'elbow') suggests an optimal K."
    },
    {
      "id": 14,
      "questionText": "Scenario: K-Means fails to separate overlapping clusters. Likely reason?",
      "options": [
        "K too small",
        "Centroids are optimal",
        "Algorithm converged correctly",
        "Clusters are not well-separated or non-convex"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means works best for well-separated convex clusters; overlapping clusters are challenging."
    },
    {
      "id": 15,
      "questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
      "options": [
        "Algorithm fails automatically",
        "Noise ignored",
        "Distance metrics lose meaning; consider PCA or feature scaling",
        "Clusters are always compact"
      ],
      "correctAnswerIndex": 2,
      "explanation": "High-dimensional spaces dilute distances; dimensionality reduction improves clustering."
    },
    {
      "id": 16,
      "questionText": "Scenario: K-Means with K too large. Effect?",
      "options": [
        "Algorithm automatically reduces K",
        "Clusters always improve",
        "Clusters may become small and meaningless",
        "Noise ignored"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Choosing K too large can lead to many tiny clusters with no meaningful pattern."
    },
    {
      "id": 17,
      "questionText": "Scenario: Mini-Batch K-Means. Advantage?",
      "options": [
        "Faster for large datasets with approximate centroids",
        "Removes noise automatically",
        "More accurate than standard K-Means",
        "Works only on small datasets"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Mini-Batch K-Means updates centroids using small random batches for efficiency on large datasets."
    },
    {
      "id": 18,
      "questionText": "Scenario: K-Means initialization affects results. Solution?",
      "options": [
        "Use single random centroid only",
        "Run algorithm multiple times or use K-Means++",
        "Ignore initialization",
        "Reduce K randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means++ and multiple runs improve stability and reduce sensitivity to initial centroids."
    },
    {
      "id": 19,
      "questionText": "Scenario: K-Means distance metric. Standard choice?",
      "options": [
        "Euclidean distance",
        "Jaccard index",
        "Hamming distance",
        "Cosine similarity"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means typically uses Euclidean distance to assign points to nearest centroids."
    },
    {
      "id": 20,
      "questionText": "Scenario: K-Means clustering produces empty clusters. Cause?",
      "options": [
        "Algorithm failed",
        "Clusters are compact",
        "No points assigned to some centroids",
        "Centroids are optimal"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Some centroids may not attract any points, leading to empty clusters."
    },
    {
      "id": 21,
      "questionText": "Scenario: K-Means on text embeddings. Preprocessing required?",
      "options": [
        "Use categorical K-Means directly",
        "No preprocessing needed",
        "Randomly assign clusters",
        "Normalize or scale vectors before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Text embeddings often need normalization to prevent certain dimensions from dominating distance computations."
    },
    {
      "id": 22,
      "questionText": "Scenario: K-Means with very high K. Effect on SSE (sum of squared errors)?",
      "options": [
        "SSE increases",
        "SSE is ignored",
        "SSE decreases as K increases",
        "SSE remains constant"
      ],
      "correctAnswerIndex": 2,
      "explanation": "As K increases, each point lies closer to its nearest centroid, so the sum of squared errors decreases."
    },
    {
      "id": 23,
      "questionText": "Scenario: K-Means on scaled vs unscaled features. Effect?",
      "options": [
        "Clusters merge randomly",
        "Scaling is important; features with large range dominate clustering",
        "Algorithm fails if not scaled",
        "Scaling is unnecessary"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Features with larger numerical ranges can dominate Euclidean distance; scaling ensures fair contribution."
    },
    {
      "id": 24,
      "questionText": "Scenario: K-Means convergence criteria. Standard check?",
      "options": [
        "Centroid positions or cluster assignments stop changing",
        "Distance metric ignored",
        "Random stopping",
        "Maximum iterations only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Algorithm stops when centroids or cluster assignments stabilize, or after a max number of iterations."
    },
    {
      "id": 25,
      "questionText": "Scenario: K-Means for image compression. How?",
      "options": [
        "Cluster pixel colors and replace each pixel by its centroid",
        "Use hierarchical clustering",
        "Remove noise automatically",
        "Reduce image resolution"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means clusters similar colors, allowing image compression by using cluster centroids as representative colors."
    },
    {
      "id": 26,
      "questionText": "Scenario: K-Means for customer segmentation. Benefit?",
      "options": [
        "Identify customer groups for targeted marketing",
        "Automatically predicts sales",
        "Detects trends over time",
        "Removes outliers"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means helps segment customers based on behavior, allowing targeted campaigns."
    },
    {
      "id": 27,
      "questionText": "Scenario: K-Means clustering results differ on repeated runs. Cause?",
      "options": [
        "Distance metric varies",
        "Random initialization of centroids",
        "Algorithm deterministic",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Random initial centroids can lead to different final clusters; K-Means++ mitigates this."
    },
    {
      "id": 28,
      "questionText": "Scenario: K-Means for anomaly detection. Approach?",
      "options": [
        "Clusters merge randomly",
        "Points far from nearest centroid may be anomalies",
        "All points treated equally",
        "Noise automatically ignored"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers are detected as points distant from cluster centroids."
    },
    {
      "id": 29,
      "questionText": "Scenario: K-Means clustering on geospatial data. Best practice?",
      "options": [
        "Use distance metric appropriate for coordinates (e.g., haversine)",
        "Randomly assign clusters",
        "Use Euclidean blindly",
        "Clusters merge arbitrarily"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Euclidean distance may misrepresent geographic distances; use geodesic metrics like haversine."
    },
    {
      "id": 30,
      "questionText": "Scenario: K-Means with highly correlated features. Solution?",
      "options": [
        "Increase K randomly",
        "Apply PCA to reduce correlated dimensions",
        "Ignore correlations",
        "Clusters merge arbitrarily"
      ],
      "correctAnswerIndex": 1,
      "explanation": "PCA reduces correlated features and improves clustering performance."
    },
    {
      "id": 31,
      "questionText": "Scenario: K-Means on non-spherical clusters. Limitation?",
      "options": [
        "Noise ignored",
        "Clusters are always compact",
        "Algorithm automatically adapts",
        "K-Means assumes spherical clusters; non-spherical clusters may be poorly separated"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means relies on Euclidean distance and assumes roughly spherical clusters, so elongated or irregular clusters are not well captured."
    },
    {
      "id": 32,
      "questionText": "Scenario: K-Means clustering produces clusters with very different densities. Challenge?",
      "options": [
        "Distance metric ignored",
        "Clusters always equal",
        "Low-density clusters may be merged incorrectly",
        "Algorithm detects densities automatically"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means does not account for density; clusters with differing densities may not be separated properly."
    },
    {
      "id": 33,
      "questionText": "Scenario: K-Means applied to text embeddings. Best practice?",
      "options": [
        "Increase K arbitrarily",
        "Normalize embeddings to unit vectors before clustering",
        "Remove half the features randomly",
        "Use raw embeddings"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Normalization ensures that distance computation reflects angle similarity rather than magnitude differences."
    },
    {
      "id": 34,
      "questionText": "Scenario: K-Means clustering with missing values. Approach?",
      "options": [
        "Ignore missing values",
        "Randomly assign missing values",
        "Algorithm automatically handles them",
        "Impute missing values before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means requires complete numerical data; missing values should be imputed or removed."
    },
    {
      "id": 35,
      "questionText": "Scenario: K-Means clustering on multi-dimensional customer features. Preprocessing step?",
      "options": [
        "Randomly drop features",
        "Scale features so all dimensions contribute equally",
        "Increase K arbitrarily",
        "Leave features unscaled"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Feature scaling ensures that dimensions with larger ranges do not dominate Euclidean distance."
    },
    {
      "id": 36,
      "questionText": "Scenario: K-Means on a dataset with outliers. Solution?",
      "options": [
        "Use standard K-Means without changes",
        "Randomly assign clusters",
        "Increase K to compensate",
        "Remove or preprocess outliers before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Outliers can distort centroids; preprocessing improves clustering accuracy."
    },
    {
      "id": 37,
      "questionText": "Scenario: K-Means convergence too slow. Solution?",
      "options": [
        "Ignore convergence",
        "Change distance metric arbitrarily",
        "Use Mini-Batch K-Means or reduce dataset size",
        "Increase K randomly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Mini-Batch K-Means or subsampling speeds up convergence for large datasets."
    },
    {
      "id": 38,
      "questionText": "Scenario: K-Means clustering with highly correlated features. Best approach?",
      "options": [
        "Increase K",
        "Ignore correlation",
        "Merge clusters arbitrarily",
        "Apply PCA or feature selection to reduce redundancy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Reducing correlated dimensions prevents redundant information from biasing distance calculations."
    },
    {
      "id": 39,
      "questionText": "Scenario: K-Means clustering on skewed data. Issue?",
      "options": [
        "Noise ignored",
        "Algorithm corrects automatically",
        "Clusters may be biased towards dense regions",
        "Clusters always balanced"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Skewed distributions can lead to unequal cluster sizes or poorly defined boundaries."
    },
    {
      "id": 40,
      "questionText": "Scenario: K-Means with K unknown. Methods to select K?",
      "options": [
        "Random choice",
        "Algorithm decides automatically",
        "Use maximum data points",
        "Elbow method, silhouette score, gap statistic"
      ],
      "correctAnswerIndex": 3,
      "explanation": "These methods help determine optimal K by evaluating clustering performance."
    },
    {
      "id": 41,
      "questionText": "Scenario: K-Means produces very similar clusters on repeated runs. Possible reason?",
      "options": [
        "Algorithm converged incorrectly",
        "Data naturally forms stable clusters",
        "Distance metric is wrong",
        "Initialization randomization failed"
      ],
      "correctAnswerIndex": 1,
      "explanation": "If data has well-separated clusters, K-Means results are stable across runs."
    },
    {
      "id": 42,
      "questionText": "Scenario: K-Means on a small dataset with large K. Risk?",
      "options": [
        "Algorithm fails completely",
        "Centroids ignored",
        "Clusters may be too small or empty",
        "Clusters automatically merge"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Too many clusters for small datasets can produce meaningless or empty clusters."
    },
    {
      "id": 43,
      "questionText": "Scenario: K-Means++ initialization. Benefit?",
      "options": [
        "Improves cluster quality by selecting distant initial centroids",
        "Random initialization",
        "Always produces identical clusters",
        "Removes noise automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means++ reduces the risk of poor initialization by spreading the initial centroids apart."
    },
    {
      "id": 44,
      "questionText": "Scenario: K-Means with categorical features. Solution?",
      "options": [
        "Use K-Prototypes or encode categories numerically",
        "Clusters merge randomly",
        "Ignore categorical data",
        "Use standard K-Means directly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Standard K-Means cannot handle categorical data; K-Prototypes or encoding is needed."
    },
    {
      "id": 45,
      "questionText": "Scenario: K-Means on noisy sensor data. Best practice?",
      "options": [
        "Use raw data",
        "Increase K arbitrarily",
        "Filter or preprocess noise before clustering",
        "Ignore convergence"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Noise affects centroids and cluster assignment; preprocessing improves results."
    },
    {
      "id": 46,
      "questionText": "Scenario: K-Means for image segmentation. Metric for colors?",
      "options": [
        "Cosine similarity",
        "Euclidean distance in RGB or LAB space",
        "Hamming distance",
        "Jaccard index"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Euclidean distance is standard for numerical pixel features in color space."
    },
    {
      "id": 47,
      "questionText": "Scenario: K-Means convergence to local minimum. Reason?",
      "options": [
        "Poor initialization of centroids",
        "Algorithm always finds global minimum",
        "Clusters are too compact",
        "Distance metric is incorrect"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Random initial centroids can lead K-Means to converge to suboptimal local minima."
    },
    {
      "id": 48,
      "questionText": "Scenario: K-Means clustering with overlapping clusters. Limitation?",
      "options": [
        "Clusters merge automatically",
        "Algorithm adapts perfectly",
        "Cannot clearly separate overlapping clusters",
        "Noise ignored"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means relies on distance; overlapping clusters may not be correctly assigned."
    },
    {
      "id": 49,
      "questionText": "Scenario: K-Means for market segmentation. Use case?",
      "options": [
        "Remove outliers automatically",
        "Identify customer groups for targeted campaigns",
        "Predict stock prices",
        "Visualize time series"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means clusters similar customers to enable targeted marketing strategies."
    },
    {
      "id": 50,
      "questionText": "Scenario: K-Means for anomaly detection in credit card transactions. Approach?",
      "options": [
        "Transactions far from cluster centroids may be fraudulent",
        "All transactions treated equally",
        "Clusters merge automatically",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Outliers distant from normal clusters can indicate anomalous or fraudulent activity."
    },
    {
      "id": 51,
      "questionText": "Scenario: K-Means on high-dimensional gene expression data. Best practice?",
      "options": [
        "Clusters merge randomly",
        "Use raw high-dimensional data directly",
        "Increase K arbitrarily",
        "Use PCA or dimensionality reduction before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Dimensionality reduction yields more meaningful clusters and avoids distance dilution."
    },
    {
      "id": 52,
      "questionText": "Scenario: K-Means on very large dataset. Speed-up technique?",
      "options": [
        "Mini-Batch K-Means",
        "Increase K",
        "Ignore convergence",
        "Use raw data"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Mini-Batch K-Means updates centroids using batches, reducing computation time."
    },
    {
      "id": 53,
      "questionText": "Scenario: K-Means applied to IoT sensor data with missing values. Solution?",
      "options": [
        "Impute missing values before clustering",
        "Remove entire dataset",
        "Assign clusters randomly",
        "Ignore missing values"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means requires complete numerical data; missing values must be handled prior to clustering."
    },
    {
      "id": 54,
      "questionText": "Scenario: K-Means applied to customer purchase history. Challenge?",
      "options": [
        "Clusters automatically balanced",
        "Sparse purchase data may lead to poor cluster separation",
        "Algorithm converges perfectly",
        "Noise ignored"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Sparse or high-dimensional data can reduce clustering accuracy; preprocessing helps."
    },
    {
      "id": 55,
      "questionText": "Scenario: K-Means with categorical features encoded as numbers. Risk?",
      "options": [
        "Algorithm works perfectly",
        "Clusters merge automatically",
        "Noise ignored",
        "Numerical encoding may introduce artificial distance relationships"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Direct numeric encoding of categorical data can misrepresent similarity between categories."
    },
    {
      "id": 56,
      "questionText": "Scenario: K-Means for spatial clustering of stores. Best practice?",
      "options": [
        "Increase K arbitrarily",
        "Clusters merge randomly",
        "Use raw coordinates directly",
        "Normalize coordinates or use appropriate distance metric"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Scaling ensures coordinates are comparable and distance computations are accurate."
    },
    {
      "id": 57,
      "questionText": "Scenario: K-Means produces poor clustering. Possible reason?",
      "options": [
        "Data not suitable for K-Means (non-spherical or overlapping)",
        "Centroids incorrect",
        "Algorithm always finds perfect clusters",
        "Distance metric irrelevant"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means struggles with non-spherical or overlapping clusters."
    },
    {
      "id": 58,
      "questionText": "Scenario: K-Means clustering on scaled features. Advantage?",
      "options": [
        "Distance metric changes",
        "Clusters merge automatically",
        "Prevents dominance by features with large range",
        "Algorithm ignores scaling"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Scaling ensures each feature contributes equally to Euclidean distance calculations."
    },
    {
      "id": 59,
      "questionText": "Scenario: K-Means with clusters of unequal variance. Issue?",
      "options": [
        "Noise ignored",
        "Algorithm automatically adjusts",
        "Clusters always compact",
        "Clusters may not accurately represent data structure"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means assumes similar variance; large differences affect cluster quality."
    },
    {
      "id": 60,
      "questionText": "Scenario: K-Means applied to time-series data. Approach?",
      "options": [
        "Use raw sequences directly",
        "Increase K arbitrarily",
        "Clusters merge randomly",
        "Extract meaningful features before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Feature extraction ensures distance metrics are meaningful for time-series clustering."
    },
    {
      "id": 61,
      "questionText": "Scenario: K-Means clusters overlap. Evaluation metric?",
      "options": [
        "Use SSE only",
        "Clusters merge randomly",
        "Ignore overlap",
        "Silhouette score measures separation and cohesion"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The silhouette score evaluates how well points fit within their own cluster versus neighboring ones."
    },
    {
      "id": 62,
      "questionText": "Scenario: K-Means with too few clusters. Result?",
      "options": [
        "Clusters may merge dissimilar points, reducing interpretability",
        "Algorithm adapts automatically",
        "Clusters always compact",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Too small a K forces dissimilar points into the same cluster, reducing accuracy."
    },
    {
      "id": 63,
      "questionText": "Scenario: K-Means for market basket analysis. Limitation?",
      "options": [
        "Algorithm works perfectly",
        "Noise ignored",
        "Clusters merge automatically",
        "Sparse and categorical data requires encoding or alternate methods"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Sparse categorical data needs careful preprocessing or K-Prototypes instead of K-Means."
    },
    {
      "id": 64,
      "questionText": "Scenario: K-Means produces empty clusters. Solution?",
      "options": [
        "Ignore empty clusters",
        "Algorithm fails automatically",
        "Increase K randomly",
        "Reinitialize centroids or reduce K"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Reassigning centroids or reducing K resolves empty clusters."
    },
    {
      "id": 65,
      "questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
      "options": [
        "Distance metrics lose meaning; reduce dimensions",
        "Algorithm adapts automatically",
        "Clusters always accurate",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High dimensions dilute distances, making clustering unreliable without dimensionality reduction."
    },
    {
      "id": 66,
      "questionText": "Scenario: K-Means on normalized vs unnormalized features. Effect?",
      "options": [
        "Algorithm automatically scales",
        "Normalization ensures fair distance contribution across features",
        "Unnormalized always better",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Normalized features prevent features with large ranges from dominating clustering."
    },
    {
      "id": 67,
      "questionText": "Scenario: K-Means++ vs random initialization. Advantage?",
      "options": [
        "Improves clustering stability and convergence",
        "Random initialization always better",
        "No difference in results",
        "Removes noise automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means++ selects initial centroids to reduce poor local minima."
    },
    {
      "id": 68,
      "questionText": "Scenario: K-Means on customer purchase amounts. Data skewed. Solution?",
      "options": [
        "Use raw data",
        "Log-transform or scale data before clustering",
        "Increase K",
        "Ignore skew"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Transforming skewed data prevents high-value points from dominating clustering."
    },
    {
      "id": 69,
      "questionText": "Scenario: K-Means on text data after TF-IDF. Challenge?",
      "options": [
        "Algorithm works perfectly",
        "High-dimensional sparse vectors; dimensionality reduction recommended",
        "Noise ignored",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Sparse, high-dimensional TF-IDF vectors can degrade clustering quality unless the dimensionality is reduced."
    },
    {
      "id": 70,
      "questionText": "Scenario: K-Means applied to IoT device readings. Best practice?",
      "options": [
        "Increase K randomly",
        "Use raw readings",
        "Ignore convergence",
        "Normalize or scale features to ensure meaningful clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Scaling ensures that features contribute equally to distance calculations for clustering."
    },
    {
      "id": 71,
      "questionText": "Scenario: K-Means clustering applied to gene expression data with thousands of features. Best approach?",
      "options": [
        "Apply PCA or feature selection to reduce dimensionality before clustering",
        "Randomly remove features",
        "Increase K arbitrarily",
        "Use all features directly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High-dimensional gene data can dilute distances; dimensionality reduction ensures meaningful clusters."
    },
    {
      "id": 72,
      "questionText": "Scenario: K-Means with very large K relative to dataset size. Risk?",
      "options": [
        "Clusters may be meaningless or empty",
        "Algorithm automatically adjusts",
        "Distance metric ignored",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Too many clusters can lead to tiny or empty clusters with no interpretability."
    },
    {
      "id": 73,
      "questionText": "Scenario: K-Means on data with non-uniform density clusters. Limitation?",
      "options": [
        "Noise ignored",
        "Low-density clusters may merge with high-density ones",
        "Algorithm adjusts automatically",
        "Clusters always compact"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means does not handle varying densities well; denser clusters dominate centroid assignment."
    },
    {
      "id": 74,
      "questionText": "Scenario: K-Means on highly skewed financial transaction data. Best preprocessing?",
      "options": [
        "Apply log transformation to reduce skew before clustering",
        "Use raw data",
        "Clusters merge randomly",
        "Increase K arbitrarily"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Log or other transformations reduce the effect of extreme values, improving clustering quality."
    },
    {
      "id": 75,
      "questionText": "Scenario: K-Means on time-series data. Effective method?",
      "options": [
        "Extract meaningful features such as trends or seasonal components before clustering",
        "Use raw sequences directly",
        "Clusters merge automatically",
        "Increase K randomly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Feature extraction ensures distances reflect meaningful similarities in time-series."
    },
    {
      "id": 76,
      "questionText": "Scenario: K-Means clustering for anomaly detection in network traffic. Strategy?",
      "options": [
        "All points treated equally",
        "Points far from cluster centroids are likely anomalies",
        "Noise ignored",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers distant from normal traffic clusters are potential anomalies."
    },
    {
      "id": 77,
      "questionText": "Scenario: K-Means applied to image color compression. Challenge?",
      "options": [
        "Algorithm automatically selects K",
        "All clusters identical",
        "Noise ignored",
        "Choosing optimal K to balance compression and image quality"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Selecting K is critical; too few clusters lose color detail, while too many reduce the compression benefit."
    },
    {
      "id": 78,
      "questionText": "Scenario: K-Means++ vs multiple random initializations. Advantage of K-Means++?",
      "options": [
        "Reduces likelihood of poor local minima and improves convergence",
        "Removes noise automatically",
        "Random initializations are better",
        "No difference in results"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means++ selects initial centroids that are distant, improving stability and cluster quality."
    },
    {
      "id": 79,
      "questionText": "Scenario: K-Means applied to sparse TF-IDF text vectors. Best approach?",
      "options": [
        "Use raw sparse vectors directly",
        "Increase K arbitrarily",
        "Reduce dimensionality using techniques like Truncated SVD before clustering",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "High-dimensional sparse data may produce poor clusters; dimensionality reduction improves performance."
    },
    {
      "id": 80,
      "questionText": "Scenario: K-Means clustering with overlapping spherical clusters. How to improve?",
      "options": [
        "K-Means always works",
        "Clusters merge automatically",
        "Reduce K randomly",
        "Use Gaussian Mixture Models (GMM) for soft clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "GMM can model cluster overlap using probability distributions, unlike hard K-Means assignments."
    },
    {
      "id": 81,
      "questionText": "Scenario: K-Means for customer segmentation with categorical attributes. Best practice?",
      "options": [
        "Use K-Prototypes or encode categories numerically",
        "Ignore categorical data",
        "Use standard K-Means directly",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Prototypes handles mixed numerical and categorical data effectively."
    },
    {
      "id": 82,
      "questionText": "Scenario: K-Means convergence to local minimum. Cause?",
      "options": [
        "Distance metric incorrect",
        "Clusters too compact",
        "Poor or random initialization of centroids",
        "Algorithm always finds global minimum"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means may converge to suboptimal solutions depending on initial centroids."
    },
    {
      "id": 83,
      "questionText": "Scenario: K-Means applied to geospatial clustering. Recommendation?",
      "options": [
        "Increase K randomly",
        "Use appropriate distance metrics like haversine for coordinates",
        "Clusters merge arbitrarily",
        "Use Euclidean distance blindly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Geographic distances require the correct metric to ensure accurate clustering."
    },
    {
      "id": 84,
      "questionText": "Scenario: K-Means with very large datasets. Efficient solution?",
      "options": [
        "Use Mini-Batch K-Means",
        "Increase K arbitrarily",
        "Ignore convergence",
        "Use full dataset only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Mini-Batch K-Means speeds up computation by using small random batches for centroid updates."
    },
    {
      "id": 85,
      "questionText": "Scenario: K-Means on noisy IoT sensor data. Best preprocessing?",
      "options": [
        "Filter or smooth noise before clustering",
        "Clusters merge automatically",
        "Increase K arbitrarily",
        "Use raw data"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Noise can distort centroids; preprocessing improves clustering reliability."
    },
    {
      "id": 86,
      "questionText": "Scenario: K-Means on very high-dimensional data. Limitation?",
      "options": [
        "Distance metrics lose meaning; dimensionality reduction recommended",
        "Noise ignored",
        "Clusters always accurate",
        "Algorithm adapts automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High-dimensional spaces dilute distances, leading to poor cluster assignments."
    },
    {
      "id": 87,
      "questionText": "Scenario: K-Means for anomaly detection in healthcare data. Approach?",
      "options": [
        "Noise ignored",
        "Points far from cluster centroids may indicate anomalies",
        "Clusters merge automatically",
        "All points treated equally"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers distant from normal clusters can indicate anomalies or rare events."
    },
    {
      "id": 88,
      "questionText": "Scenario: K-Means on image segmentation with varying illumination. Challenge?",
      "options": [
        "Preprocessing like normalization is needed to reduce lighting effects",
        "Algorithm works perfectly",
        "Increase K randomly",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Differences in lighting affect pixel values; normalization improves clustering consistency."
    },
    {
      "id": 89,
      "questionText": "Scenario: K-Means for market segmentation with mixed purchase behavior. Solution?",
      "options": [
        "Ignore categorical data",
        "Clusters merge randomly",
        "Use numerical encoding or K-Prototypes for categorical and numerical features",
        "Use standard K-Means directly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Mixed data requires specialized clustering methods for meaningful segmentation."
    },
    {
      "id": 90,
      "questionText": "Scenario: K-Means clustering produces empty clusters repeatedly. Best solution?",
      "options": [
        "Algorithm fails automatically",
        "Increase K arbitrarily",
        "Ignore empty clusters",
        "Reinitialize centroids or reduce K"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Empty clusters occur when centroids have no assigned points; reinitialization or lowering K resolves this."
    },
    {
      "id": 91,
      "questionText": "Scenario: K-Means applied to highly imbalanced datasets. Issue?",
      "options": [
        "Large clusters may dominate, small clusters underrepresented",
        "Clusters always balanced",
        "Noise ignored",
        "Algorithm adapts automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means does not account for cluster size; imbalance may distort results."
    },
    {
      "id": 92,
      "questionText": "Scenario: K-Means applied to network traffic logs for intrusion detection. Best approach?",
      "options": [
        "Use raw logs directly",
        "Increase K arbitrarily",
        "Preprocess logs into numerical features and detect points far from centroids",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Transforming logs to numerical vectors enables clustering and anomaly detection."
    },
    {
      "id": 93,
      "questionText": "Scenario: K-Means clustering with multiple valid K values. Evaluation metric?",
      "options": [
        "Silhouette score to evaluate cluster quality",
        "Ignore K selection",
        "Use SSE only",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Silhouette score measures cohesion and separation, helping choose optimal K."
    },
    {
      "id": 94,
      "questionText": "Scenario: K-Means applied to text clustering using word embeddings. Limitation?",
      "options": [
        "High-dimensional vectors may require dimensionality reduction or normalization",
        "Clusters merge randomly",
        "Algorithm works perfectly",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Dimensionality reduction and normalization improve clustering accuracy for embeddings."
    },
    {
      "id": 95,
      "questionText": "Scenario: K-Means clustering results vary on repeated runs. Best solution?",
      "options": [
        "Use K-Means++ initialization or multiple runs",
        "Clusters merge randomly",
        "Ignore variations",
        "Increase K arbitrarily"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Better initialization reduces sensitivity to random centroid placement."
    },
    {
      "id": 96,
      "questionText": "Scenario: K-Means on scaled features vs unscaled features. Observation?",
      "options": [
        "Scaling ensures fair contribution of all features to distance calculation",
        "Clusters merge randomly",
        "Algorithm adapts automatically",
        "Scaling is unnecessary"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Without scaling, features with larger ranges dominate cluster assignments."
    },
    {
      "id": 97,
      "questionText": "Scenario: K-Means clustering on overlapping clusters. Alternative?",
      "options": [
        "Reduce K randomly",
        "Clusters merge automatically",
        "Use soft clustering like Gaussian Mixture Models",
        "K-Means handles overlap perfectly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Soft clustering models allow points to belong probabilistically to multiple clusters."
    },
    {
      "id": 98,
      "questionText": "Scenario: K-Means applied to sensor network data with missing values. Solution?",
      "options": [
        "Impute missing values before clustering",
        "Assign clusters randomly",
        "Remove entire dataset",
        "Ignore missing values"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means requires complete data; missing values must be handled prior to clustering."
    },
    {
      "id": 99,
      "questionText": "Scenario: K-Means on customer behavior data with high variance features. Best approach?",
      "options": [
        "Increase K arbitrarily",
        "Clusters merge randomly",
        "Use raw data",
        "Scale or normalize features to prevent dominance by high-variance features"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Scaling ensures fair contribution of each feature to distance computation."
    },
    {
      "id": 100,
      "questionText": "Scenario: K-Means applied to a large dataset with many outliers. Recommendation?",
      "options": [
        "Increase K arbitrarily",
        "Clusters merge automatically",
        "Use raw data directly",
        "Preprocess to remove or handle outliers before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Outliers distort centroids; preprocessing ensures meaningful cluster assignments."
    }
  ]
}