如何将词嵌入用作CRF(sklearn-crfsuite)模型训练的特征

我想开发一个NER模型,并希望把词嵌入作为特征来训练CRF模型。不使用词嵌入特征时,代码可以正常运行;但当我把词嵌入作为CRF训练的特征加入后,就出现了错误消息。以下是我的部分代码:

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle
from gensim.models import KeyedVectors
import numpy as np
# Load the pre-trained word2vec vectors directly from the binary file named below.
model1 = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)
### Embedding function 
def get_features(word):
    """Look up the pre-trained embedding of ``word`` in ``model1``.

    The word is lower-cased before the lookup in the module-level ``model1``.

    Args:
        word: The token text to look up.

    Returns:
        A list holding the single embedding vector of the word, or an empty
        list when the lookup raises ``KeyError`` (word missing from the
        model's vocabulary).
    """
    word = word.lower()
    try:
        return [model1[word]]
    except KeyError:
        # Only a missing key is an expected failure here; any other error
        # should surface instead of being silently swallowed.
        return []

def _neighbor_features(token, prefix):
    """Build the context features of a neighbouring token.

    Args:
        token: The neighbouring token; fields 0, 1, 2, 4 and 5 are read
            (field 3 is the label, which ``sent2labels`` extracts).
        prefix: String prepended to every feature key, e.g. ``'-1:'``.

    Returns:
        A dict of features describing ``token`` itself.
    """
    word = token[0]
    postag = token[1]
    tag1 = token[2]
    tag2 = token[4]
    tag3 = token[5]
    return {
        prefix + 'word.lower()': word.lower(),
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isupper()': word.isupper(),
        # NOTE: array-valued feature, see the note in word2features.
        prefix + 'wordembdding': np.array(get_features(word)),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
        prefix + 'tag1': tag1,
        prefix + 'tag1[:2]': tag1[:2],
        prefix + 'tag2': tag2,
        prefix + 'tag2[:2]': tag2[:2],
        prefix + 'tag3': tag3,
        prefix + 'tag3[:2]': tag3[:2],
        prefix + 'wordlength': len(word),
        # word[:1] instead of word[0] so an empty token does not raise.
        prefix + 'wordinitialcap': word[:1].isupper(),
        prefix + 'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        prefix + 'wordallcap': all(ch.isupper() for ch in word),
    }


def word2features(sent,i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Each token is indexed at positions 0 (word), 1 (POS tag), 2, 4 and 5
    (additional tags); position 3 is not read here.

    The features cover the current token plus, when they exist, the previous
    (``-1:`` keys) and next (``+1:`` keys) tokens. ``BOS`` / ``EOS`` flags
    mark the first / last token of the sentence.

    Context features are computed from the neighbouring token itself; the
    earlier version reused the current token's tags and word for them.

    Args:
        sent: Sequence of tokens making up one sentence.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    word = sent[i][0]
    # NOTE: this stores a NumPy array as a feature value. It is the embedding
    # feature the surrounding question is about (training then fails with
    # "only size-1 arrays can be converted to Python scalars"), so it is kept
    # as-is here; the vector has to be expanded into one scalar feature per
    # component for training to work.
    wordembdding = np.array(get_features(word))
    postag = sent[i][1]
    tag1 = sent[i][2]
    tag2 = sent[i][4]
    tag3 = sent[i][5]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'wordembdding': wordembdding,
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        # word[:1] instead of word[0] so an empty token does not raise.
        'wordinitialcap': word[:1].isupper(),
        'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        'wordallcap': all(ch.isupper() for ch in word),
        'distfromsentbegin': i,
    }

    if i > 0:
        features.update(_neighbor_features(sent[i-1], '-1:'))
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        features.update(_neighbor_features(sent[i+1], '+1:'))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the feature dict of every token in ``sent``, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Collect the label (fourth field) of every six-field token in ``sent``."""
    labels = []
    # Unpacking enforces that each token has exactly six fields.
    for _word, _pos, _tag_a, label, _tag_b, _tag_c in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the word (first field) of every token in ``sent``.

    Tokens are indexed rather than unpacked so that this works for the
    six-field tokens the other helpers read; unpacking into four names
    raised ``ValueError`` on them.
    """
    return [token[0] for token in sent]



# Turn every sentence into its per-token feature dicts and its label sequence.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))


%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',c1=0.1,c2=0.1,max_iterations=100,all_possible_transitions=True
)
crf.fit(X_train,y_train)   ### Error message when try to train

当我想训练CRF模型时,出现以下错误消息:

TypeError: only size-1 arrays can be converted to Python scalars

有人可以建议我如何使用词嵌入矢量来训练CRF模型吗?

jian123789 回答:如何将词嵌入用作CRF(sklearn-crfsuite)模型训练的特征

您可以在这里(here)看到,目前 python-crfsuite 和 sklearn-crfsuite 都不支持词嵌入这类数组形式的特征。

相反,您可以将每个矢量分量作为特征传递。

{...
 'v0': 1.81583762e-02,'v1': 2.83553465e-02,...
 'v299': -4.26079705e-02,...}

我建议替换您的 get_features 函数:

def get_features(word):
    """Return the embedding vector of ``word`` from the module-level ``model1``.

    The word is lower-cased before the lookup. When the lookup raises
    ``KeyError`` (word missing from the model's vocabulary), an all-zeros
    vector of the model's dimensionality is returned instead, so callers
    always receive one vector of the same length.

    Args:
        word: The token text to look up.

    Returns:
        A one-dimensional NumPy array of length ``model1.vector_size``.
    """
    word = word.lower()
    try:
        return model1[word]
    except KeyError:
        # Out-of-vocabulary word: fall back to a zero vector sized from the
        # loaded model rather than a hard-coded 300.
        return np.zeros(model1.vector_size)

然后修改 word2features 函数,为向量的每个分量返回一个新特征:

def word2features(sent,i):
    """Start building the feature dict for the token at position ``i``.

    The current token's lexical and tag features are collected first, then
    one scalar feature per embedding component is added under the keys
    ``v0``, ``v1``, ... This snippet is truncated in the original text
    ("And so on..."): the rest of the function, including the return
    statement, is not shown here.
    """
    word = sent[i][0]
    embedding = get_features(word)  # embedding vector of the current word
    token = sent[i]
    postag = token[1]
    tag1 = token[2]
    tag2 = token[4]
    tag3 = token[5]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        'wordinitialcap': word[0].isupper(),
        'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        'wordallcap': all(ch.isupper() for ch in word),
        'distfromsentbegin': i,
    }

    # One feature per vector component, because array values are not accepted.
    features.update(
        ('v{}'.format(index), component)
        for index, component in enumerate(embedding)
    )

# And so on...

两个小注释:

  • 如果您的文本中有很多不在词汇表中的单词,词嵌入就无法显著改善您的NER模型。也许您可以使用 FastText(也已集成在 Gensim 中)来正确处理未登录词。
  • 即使有用,为每个单词添加向量嵌入也会使您的训练集很大,训练时间长且分类器很大。
本文链接:https://www.f2er.com/3149722.html

大家都在问