我正在尝试使用kmeans对句子进行聚类
但没有正确输入cluster()
的输入类型。
我尝试将列表Y与使用wordembedding
def创建的sent_vectorizer
一起使用,并且还尝试了dataframe
版本的Y。
def sent_vectorizer(sent,model): #creates vectors for each tokenized sentence
sent_vec =[]
numw = 0
for w in sent:
try:
if numw == 0:
sent_vec = model[w]
else:
sent_vec = np.add(sent_vec,model[w]) #adds vectors of all words in a sentence over iterations
numw+=1 #counts the number of words in all sentences
except:
pass
return np.asarray(sent_vec) / numw
Y=[]
for sentence in all_words:
Y.append(sent_vectorizer(sentence,model))
print ("========================")
print (Y)
df_Y = pd.DataFrame(Y)
NUM_CLUSTERS=3
kclusterer = KMeansClusterer(NUM_CLUSTERS,distance=nltk.cluster.util.cosine_distance,repeats=25,avoid_empty_clusters=True)
assigned_clusters = kclusterer.cluster(df_Y,assign_clusters=True)
print (assigned_clusters)
all_words有一个标记化句子的列表:
[['cloud','technologies'],['still','building','strong','foundation','game','changers','hyper','converged','technology','sd','wan','tying','colleges','together','one','seamless','security','plane','protecting','college','data'],['none'],['data','analytics'],['customer','experience','improvements'],['ar','vr'],['none','timeframe','longer','term','cloud','services','ai','technologies','changer'],['cloud','finance','integrated','ship','management'],['microservices','based','api','platform','omni','channels'],['moving','erp','cloud'],['online','learning','open','educational','resources'],'programs','meet','needs','today','learners'],['automation','existing','processes']]
错误回溯:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-108-68f5fe386b54> in <module>
1 NUM_CLUSTERS=3
2 kclusterer = KMeansClusterer(NUM_CLUSTERS,avoid_empty_clusters=True)
----> 3 assigned_clusters = kclusterer.cluster(df_Y,assign_clusters=True)
4 print (assigned_clusters)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\cluster\util.py in cluster(self,vectors,assign_clusters,trace)
60
61 # call abstract method to cluster the vectors
---> 62 self.cluster_vectorspace(vectors,trace)
63
64 # assign the vectors to clusters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\cluster\kmeans.py in cluster_vectorspace(self,trace)
99 # effect the distance comparison)
100 for means in meanss:
--> 101 means.sort(key=sum)
102
103 # find the set of means that's minimally different from the others
TypeError: 'float' object is not iterable ```
I also tried the following code and get error there as well:
```
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(Y)
```
the error in this case is:
```
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-48-229383cd99be> in <module>
1 kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
----> 2 kmeans.fit(Y)
3
4 labels = kmeans.labels_
5 centroids = kmeans.cluster_centers_
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\cluster\k_means_.py in fit(self,X,y,sample_weight)
969 tol=self.tol,random_state=random_state,copy_x=self.copy_x,970 n_jobs=self.n_jobs,algorithm=self.algorithm,--> 971 return_n_iter=True)
972 return self
973
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\cluster\k_means_.py in k_means(X,n_clusters,sample_weight,init,precompute_distances,n_init,max_iter,verbose,tol,random_state,copy_x,n_jobs,algorithm,return_n_iter)
309 order = "C" if copy_x else None
310 X = check_array(X,accept_sparse='csr',dtype=[np.float64,np.float32],--> 311 order=order,copy=copy_x)
312 # verify that the number of samples given is larger than k
313 if _num_samples(X) < n_clusters:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array,accept_sparse,accept_large_sparse,dtype,order,copy,force_all_finite,ensure_2d,allow_nd,ensure_min_samples,ensure_min_features,warn_on_dtype,estimator)
525 try:
526 warnings.simplefilter('error',ComplexWarning)
--> 527 array = np.asarray(array,dtype=dtype,order=order)
528 except ComplexWarning:
529 raise ValueError("Complex data not supported\n"
~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\numeric.py in asarray(a,order)
536
537 """
--> 538 return array(a,copy=False,order=order)
539
540
ValueError: setting an array element with a sequence.