我正在尝试在Google计算引擎中运行以下代码:
import itertools
import math
import pandas as pd
import os
import numpy as np
ner_df = pd.read_csv('ner_dataset.csv',encoding = 'ISO-8859-1')
sentences_words = []
sentences_tags = []
curr_sent_num = -1
current_sentence_words = []
current_sentence_tags = []
for sent_num,word,tag in ner_df[['Sentence #','Word','Tag']].values:
if isinstance(sent_num,str) and 'Sentence: ' in sent_num:
curr_sent_num = int(sent_num.split(':')[1].strip())
if current_sentence_words and current_sentence_tags:
sentences_words.append(current_sentence_words)
sentences_tags.append(current_sentence_tags)
current_sentence_words = []
current_sentence_tags = []
current_sentence_words.append(word)
current_sentence_tags.append(tag)
len(sentences_words),len(sentences_tags)
train_size = int(len(sentences_words) * 0.8)
train_sentences_words = sentences_words[:train_size]
train_sentences_tags = sentences_tags[:train_size]
test_sentences_words = sentences_words[train_size:]
test_sentences_tags = sentences_tags[train_size:]
print('Train:',len(train_sentences_words),len(train_sentences_tags))
print('Test:',len(test_sentences_words),len(test_sentences_tags))
vocab = set(itertools.chain(*[[w for w in s] for s in train_sentences_words]))
tags = set(itertools.chain(*[[w for w in s] for s in train_sentences_tags]))
sentenecs_lens = map(len,train_sentences_words)
print(len(vocab),len(tags),len(list(sentenecs_lens)))
MAX_LEN = 75#max(sentenecs_lens)
VOCAB_SIZE = len(vocab)
print('VOCAB_SIZE:',VOCAB_SIZE)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
words_tokenizer = Tokenizer(num_words=VOCAB_SIZE,filters=[],oov_token='__UNKNOWN__')
words_tokenizer.fit_on_texts(map(lambda s: ' '.join(s),train_sentences_words))
word_index = words_tokenizer.word_index
word_index['__PADDING__'] = 0
index_word = {i:w for w,i in word_index.items()}
print('Unique tokens:',len(word_index))
train_sequences = words_tokenizer.texts_to_sequences(map(lambda s: ' '.join(s),train_sentences_words))
test_sequences = words_tokenizer.texts_to_sequences(map(lambda s: ' '.join(s),test_sentences_words))
train_sequences_padded = pad_sequences(train_sequences,maxlen=MAX_LEN)
test_sequences_padded = pad_sequences(test_sequences,maxlen=MAX_LEN)
print(train_sequences_padded.shape,test_sequences_padded.shape)
tags_tokenizer = Tokenizer(num_words=len(tags),filters='',oov_token='__UNKNOWN__',lower=False)
tags_tokenizer.fit_on_texts(map(lambda s: ' '.join(s),train_sentences_tags))
tag_index = tags_tokenizer.word_index
tag_index['__PADDING__'] = 0
index_tag = {i:w for w,i in tag_index.items()}
index_tag_wo_padding = dict(index_tag)
index_tag_wo_padding[tag_index['__PADDING__']] = '0'
print('Unique tags:',len(tag_index))
train_tags = tags_tokenizer.texts_to_sequences(map(lambda s: ' '.join(s),train_sentences_tags))
test_tags = tags_tokenizer.texts_to_sequences(map(lambda s: ' '.join(s),test_sentences_tags))
train_tags_padded = pad_sequences(train_tags,maxlen=MAX_LEN)
test_tags_padded = pad_sequences(test_tags,maxlen=MAX_LEN)
train_tags_padded = np.expand_dims(train_tags_padded,-1)
test_tags_padded = np.expand_dims(test_tags_padded,-1)
print(train_tags_padded.shape,test_tags_padded.shape)
for w,t in zip(train_sequences_padded[123],train_tags_padded[123]):
print(index_word[w],index_tag[t[0]])
from keras.layers import Dense,Input,LSTM,Embedding,Bidirectional,Dropout
from keras.models import Model
from keras.initializers import Constant
embeddings = {}
with open('glove.6B.50d.txt') as f:
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:],dtype='float32')
embeddings[word] = coefs
print('# vectors:',len(embeddings))
# prepare embedding matrix
num_words = min(VOCAB_SIZE,len(word_index) + 1)
embedding_matrix = np.zeros((num_words,50))
for word,i in word_index.items():
if i >= VOCAB_SIZE:
continue
embedding_vector = embeddings.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
pretrained_embedding_layer = Embedding(VOCAB_SIZE,50,embeddings_initializer=Constant(embedding_matrix),input_length=MAX_LEN,trainable=False)
sequence_input = Input(shape=(MAX_LEN,),dtype='int32')
embedded_sequences = pretrained_embedding_layer(sequence_input)
...
我在最后一条语句中遇到以下错误:
TypeError:急于执行不支持形状的tf.constant(值包含1441550个元素,形状为(31815,50)包含1590750个元素)。
但是,我可以在colab中毫无问题地运行代码。 我该如何解决这个问题?