我想开发一个NER模型,在这里我想使用词嵌入功能来训练CRF模型。没有词嵌入功能的代码可以完美工作,但是当我将嵌入作为CRF训练的功能插入时,出现错误消息。这是我的代码段的一部分:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from itertools import chain
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle
from gensim.models import KeyedVectors
import numpy as np
# Load vectors directly from the file
model1 = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True) ### Loading pre-trainned word2vec model
### Embedding function
def get_features(word):
word=word.lower()
vectors=[]
try:
vectors.append(model1[word])
except:
pass
#vectors=np.array(vectors)
#vectors=vectors[0]
return vectors
def word2features(sent,i):
word = sent[i][0]
wordembdding=get_features(word) ## word embedding vector
wordembdding=np.array(wordembdding) ## vectors
#wordembdding=
#wordembdding=wordembdding[0]
postag = sent[i][1]
tag1=sent[i][2]
tag2=sent[i][4]
tag3 = sent[i][5]
features = {
'bias': 1.0,'word.lower()': word.lower(),'word[-3:]': word[-3:],'word[-2:]': word[-2:],'wordembdding': wordembdding,'word.isupper()': word.isupper(),'word.istitle()': word.istitle(),'word.isdigit()': word.isdigit(),'postag': postag,'postag[:2]': postag[:2],'tag1': tag1,'tag1[:2]': tag1[:2],'tag2': tag2,'tag2[:2]': tag2[:2],'tag3': tag3,'tag3[:2]': tag3[:2],'wordlength': len(word),'wordinitialcap': word[0].isupper(),'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,'wordallcap': len([x for x in word if x.isupper()])==len(word),'distfromsentbegin': i
}
if i > 0:
word1 = sent[i-1][0]
wordembdding1= get_features(word1)
wordembdding1=np.array(wordembdding1)
#wordembdding1=f2(wordembdding1)
postag1 = sent[i-1][1]
tag11=sent[i-1][2]
tag22=sent[i-1][4]
tag33 = sent[i-1][5]
features.update({
'-1:word.lower()': word1.lower(),'-1:word.istitle()': word1.istitle(),'-1:word.isupper()': word1.isupper(),'-1:wordembdding': wordembdding1,# word embedding features
'-1:postag': postag1,'-1:postag[:2]': postag1[:2],'-1:tag1': tag1,'-1:tag1[:2]': tag1[:2],'-1:tag2': tag2,'-1:tag2[:2]': tag2[:2],'-1:tag3': tag3,'-1:tag3[:2]': tag3[:2],'-1:wordlength': len(word),'-1:wordinitialcap': word[0].isupper(),'-1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,'-1:wordallcap': len([x for x in word if x.isupper()])==len(word),})
else:
features['BOS'] = True
if i < len(sent)-1:
word1 = sent[i+1][0]
wordembdding1= get_features(word1)
wordembdding1= get_features(word1)
wordembdding1=np.array(wordembdding1) ## word embedding features
#wordembdding1=f2(wordembdding)
postag1 = sent[i+1][1]
tag11=sent[i+1][2]
tag22=sent[i+1][4]
tag33 = sent[i+1][5]
features.update({
'+1:word.lower()': word1.lower(),'+1:word.istitle()': word1.istitle(),'+1:word.isupper()': word1.isupper(),'+1:wordembdding': wordembdding1,'+1:postag': postag1,'+1:postag[:2]': postag1[:2],'+1:tag1': tag1,'+1:tag1[:2]': tag1[:2],'+1:tag2': tag2,'+1:tag2[:2]': tag2[:2],'+1:tag3': tag3,'+1:tag3[:2]': tag3[:2],'+1:wordlength': len(word),'+1:wordinitialcap': word[0].isupper(),'+1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,'+1:wordallcap': len([x for x in word if x.isupper()])==len(word),})
else:
features['EOS'] = True
return features
def sent2features(sent):
return [word2features(sent,i) for i in range(len(sent))]
def sent2labels(sent):
return [label for token,postag,tag1,label,tag2,tag3 in sent]
def sent2tokens(sent):
return [token for token,tag3,tag4,tag5 in sent]
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]
%%time
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',c1=0.1,c2=0.1,max_iterations=100,all_possible_transitions=True
)
crf.fit(X_train,y_train) ### Error message when try to train
当我想训练CRF模型时,出现以下错误消息:
TypeError: only size-1 arrays can be converted to Python scalars
有人可以建议我如何使用词嵌入矢量来训练CRF模型吗?