def kmeans(X, n_cluster, random_seed=2, n_init=100):
    '''
    Cluster the rows of X with k-means and return the centers and labels.

    The initial centers are distinct data points. The first one is drawn
    uniformly at random; each following one is drawn with probability
    proportional to its Euclidean distance from the most recently chosen
    center, so points far from it are more likely to be picked. The centers
    are then refined by alternating nearest-center assignment and per-cluster
    averaging until the assignment stops changing or n_init iterations have
    been performed.

    If n_cluster = 4 the labels take the values 0, 1, 2 and 3.

    Args:
        X: np.array of shape (n_samples, n_features) holding the input data
        n_cluster: number of clusters, between 1 and n_samples inclusive
        random_seed: seed for the private random generator used for the
            initial centers; the same seed gives the same result and the
            global numpy random state is left untouched
        n_init: maximum number of assignment/update iterations

    Returns:
        centers: float np.array of shape (n_cluster, n_features)
        labels: integer np.array of shape (n_samples,) with the index of the
            center assigned to each row of X (all zeros when n_init is 0)

    Raises:
        ValueError: if X is not 2-D or n_cluster is outside [1, n_samples]

    Side effects:
        Prints the indices of the data points used as initial centers.
    '''
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError('X must be a 2-D array of shape (n_samples, n_features)')
    n_samples = X.shape[0]
    if not 1 <= n_cluster <= n_samples:
        raise ValueError('n_cluster must be between 1 and the number of rows in X')

    # A private generator keeps the run reproducible without reseeding the
    # global numpy random state used by the rest of the program.
    rng = np.random.RandomState(random_seed)

    # --- choose the initial centers -------------------------------------
    initial_centers_ind = np.empty(n_cluster, dtype=int)
    is_center = np.zeros(n_samples, dtype=bool)

    # randomly pick the first center
    initial_centers_ind[0] = rng.randint(n_samples)
    is_center[initial_centers_ind[0]] = True

    for i in range(1, n_cluster):
        # distance from the previously chosen center to every point
        weights = np.linalg.norm(X - X[initial_centers_ind[i - 1]], axis=1)
        # a point that is already a center must not be selected again
        weights[is_center] = 0.0
        total = weights.sum()
        if total > 0:
            # longer distance -> higher probability of being picked
            next_ind = rng.choice(n_samples, p=weights / total)
        else:
            # every remaining point coincides with the previous center, so
            # distance gives no preference: pick uniformly among the rest
            next_ind = rng.choice(np.flatnonzero(~is_center))
        initial_centers_ind[i] = next_ind
        is_center[next_ind] = True
    print('initial centers:', initial_centers_ind)

    # --- refine the centers ---------------------------------------------
    # integer-array indexing returns a new array, so X itself is not modified
    centers = X[initial_centers_ind]
    labels = np.zeros(n_samples, dtype=int)

    for iteration in range(n_init):
        # for each point, find the nearest center (argmin keeps the
        # lowest-index center on ties)
        dist = np.linalg.norm(X[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)
        new_labels = dist.argmin(axis=1)

        # unchanged assignment means the centers are already the means of
        # their clusters, so further iterations would change nothing
        if iteration > 0 and np.array_equal(new_labels, labels):
            break
        labels = new_labels

        # for each group of points, calculate the new center
        for k in range(n_cluster):
            members = X[labels == k]
            # an empty cluster keeps its previous center instead of
            # becoming NaN through a division by zero
            if len(members):
                centers[k] = members.mean(axis=0)

    return centers, labels
# Adjust the arguments of this call to experiment with the implementation.
# Only the first two columns of X are clustered.
centers, labels = kmeans(
    X[:, :2],
    n_cluster=6,
    random_seed=5,
    n_init=100,
)