我正在做一个小型的机器学习项目。我在一个数据框中收集了该公司在S&P500中的所有股票行情及其每日价值。问题是当我调用函数
时do_ml('BAC')
我收到此错误:
Traceback (most recent call last):
File "c:\Users\giuli\Desktop\UDEMY COURSES\finance\Git_rep\processing _for_ml.py",line 87,in <module>
do_ml('BAC')
File "c:\Users\giuli\Desktop\UDEMY COURSES\finance\Git_rep\processing _for_ml.py",line 72,in do_ml
X,y,df = extract_featuresets(ticker)
File "c:\Users\giuli\Desktop\UDEMY COURSES\finance\Git_rep\processing _for_ml.py",line 46,in extract_featuresets
df['{}_d1'.format(ticker)],File "C:\Python37\lib\site-packages\pandas\core\frame.py",line 2980,in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Python37\lib\site-packages\pandas\core\indexes\base.py",line 2899,in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx",line 107,in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx",line 126,line 152,in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index.pyx",line 169,in pandas._libs.index.IndexEngine._maybe_get_bool_indexer
KeyError: 'BAC_d1'
Thiese是与该函数有关的所有函数:
def remove_extra_characters(df):
df.rename(columns={old:old.strip() for old in df.columns},inplace=True)
def process_data_for_labels(ticker):
hm_days=7
df = pd.read_csv('sp500_joined_closes.csv',index_col = 0 )
tickers = df.columns.values.tolist()
df.fillna(0,inplace=True)
remove_extra_characters(df)
for i in range(1,hm_days+1):
df['{}_{}d'.format(ticker,i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
print(df)
df.fillna(0,inplace=True)
return tickers,df
def buy_sell_hold(*args):
cols = [c for c in args]
requirements = 0.02
for col in cols:
if col > requirements:
return 1
if col < requirements:
return -1
return 0
def extract_featuresets(ticker):
tickers,df = process_data_for_labels(ticker)
df['{}_target'.format(ticker)] = list(map( buy_sell_hold,df['{}_d1'.format(ticker)],df['{}_d2'.format(ticker)],df['{}_d3'.format(ticker)],df['{}_d4'.format(ticker)],df['{}_d5'.format(ticker)],df['{}_d6'.format(ticker)],df['{}_d7'.format(ticker)]))
vals = df['{}target'.format(ticker)].values.tolist()
str_vals= [str(i) for i in vals]
print('Data spread:',Counter(str_vals))
df.fillna(0,inplace=True)
df = df.replace([np.inf,-np.inf],np.nan)
df.dropna(inplace=True)
df_vals = df[[ticker for ticker in tickers]].pct_change()
df_vals = df_vals.replace([np.inf,0)
df_vals.fillna(0,inplace=True)
X = df_vals.values #Percent change data of comanies.
y = df['{}target'.format(ticker)].values #target
return X,df
def do_ml(ticker):
X,df = extract_featuresets(ticker)
#setting sets for training and testing giving it 25% of the size each.
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,test_size = 0.25)
#selecting classifier
clf = neighbors.KNeighborsClassifier()
#fitting data in classifier
clf.fit(X_train,y_train)
confidence = clf.score(X_test,y_test)
predictions = clf.predict(X_test)
print('Predicted spread:',Counter(predictions))
return confidence
问题肯定在extract_feature函数中。我在这里所做的基本上是在我的数据框中创建7列,每个列都包含一个值,该值试图预测当日的价格变化(因此,数据框将显示下周每一天的价格变化)