# #StackBounty: #python #statistics Cross-validating Fixed-effect Model

### Bounty: 50

Code Summary

The following code is a data science script I’ve been working on that cross-validates a fixed effect model. I’m moving from R to Python and would appreciate feedback on the code below.

The code does the following:

1. Split data into train and test using a custom function that groups/clusters the data

2. Estimate a linear fixed effect model with train and test data

3. Calculate RMSE and tstat to verify independence of residuals

4. Prints RMSE, SE, and tstat from cross-validation exercise.

Note: the code downloads a remote data set, so the code can be run on its own.

Code

``````from urllib import request
from scipy import stats
import pandas as pd
import numpy as np
import statsmodels.api as sm

print("Defining functions......")

def main():
"""
Estimate baseline and degree day regression.

Returns:
data.frame with RMSE, SE, and tstats
"""
print("https://github.com/johnwoodill/corn_yield_pred/raw/master/data/full_data.pickle")
file_url = "https://github.com/johnwoodill/corn_yield_pred/raw/master/data/full_data.pickle"
request.urlretrieve(file_url, "full_data.pickle")

# Baseline WLS Regression Cross-Validation with FE and trends
print("Estimating Baseline Regression")
basedat = cropdat[['ln_corn_yield', 'trend', 'trend_sq', 'corn_acres']]
fe_group = pd.get_dummies(cropdat.fips)
regdat = pd.concat([basedat, fe_group], axis=1)
base_rmse, base_se, base_tstat = felm_cv(regdat, cropdat['trend'])

# Degree Day Regression Cross-Validation
print("Estimating Degree Day Regression")
dddat = cropdat[['ln_corn_yield', 'dday0_10C', 'dday10_30C', 'dday30C',
'prec', 'prec_sq', 'trend', 'trend_sq', 'corn_acres']]
fe_group = pd.get_dummies(cropdat.fips)
regdat = pd.concat([dddat, fe_group], axis=1)
ddreg_rmse, ddreg_se, ddreg_tstat = felm_cv(regdat, cropdat['trend'])

# Get results as data.frame
fdat = {'Regression': ['Baseline', 'Degree Day',],
'RMSE': [base_rmse, ddreg_rmse],
'se': [base_se, ddreg_se],
't-stat': [base_tstat, ddreg_tstat]}

fdat = pd.DataFrame(fdat, columns=['Regression', 'RMSE', 'se', 't-stat'])

# Calculate percentage change
fdat['change'] = (fdat['RMSE'] - fdat['RMSE'].iloc[0])/fdat['RMSE'].iloc[0]
return fdat

def felm_rmse(y_train, x_train, weights, y_test, x_test):
"""
Estimate WLS from y_train, x_train, predict using x_test, calculate RMSE,
and test whether residuals are independent.

Arguments:
y_train: Dep variable - Full or training data
x_train: Covariates - Full or training data
weights: Weights for WLS
y_test: Dep variable - test data
x_test: Covariates - test data

Returns:
Returns tuple with RMSE and tstat from ttest
"""
# Fit model and get predicted values of test data
mod = sm.WLS(y_train, x_train, weights=weights).fit()
pred = mod.predict(x_test)

#Get residuals from test data
res = (y_test[:] - pred.values)

# Calculate ttest to check residuals from test and train are independent
t_stat = stats.ttest_ind(mod.resid, res, equal_var=False)[0]

# Return RMSE and t-stat from ttest
return (np.sqrt(np.mean(res**2)), t_stat)

def gc_kfold_cv(data, group, begin, end):
"""
Custom group/cluster data split for cross-validation of panel data.
(Ensure groups are clustered and train and test residuals are independent)

Arguments:
data:     data to filter with 'trend'
group:    group to cluster
begin:    start of cluster
end:      end of cluster

Return:
Return test and train data for Group-by-Cluster Cross-validation method
"""
# Get group data
data = data.assign(group=group.values)

# Filter test and train based on begin and end
test = data[data['group'].isin(range(begin, end))]
train = data[~data['group'].isin(range(begin, end))]

# Return train and test
dfs = {}
tsets = [train, test]

# Combine train and test to return dfs
for i, val in enumerate([1, 2]):
dfs[val] = tsets[i]

return dfs

def felm_cv(regdata, group):
"""
Cross-validate WLS FE model

Arguments:
regdata:  regression data
group:    group fixed effect

Returns:
return mean RMSE, standard error, and mean tstat from ttest
"""
# Loop through 1-31 years with 5 groups in test set and 26 train set
#i = 1
#j = False
retrmse = []
rettstat = []
#for j, val in enumerate([1, 27]):
for j in range(1, 28):
# Get test and training data
tset = gc_kfold_cv(regdata, group, j, j + 4)

# Separate y_train, x_train, y_test, x_test, and weights
y_train = tset[1].ln_corn_yield
x_train = tset[1].drop(['ln_corn_yield', 'corn_acres'], 1)
weights = tset[1].corn_acres
y_test = tset[2].ln_corn_yield
x_test = tset[2].drop(['ln_corn_yield', 'corn_acres'], 1)

# Get RMSE and tstat from train and test data
inrmse, t_stat = felm_rmse(y_train, x_train, weights, y_test, x_test)

# Append RMSE and tstats to return
retrmse.append(inrmse)
rettstat.append(t_stat)

# If end of loop return mean RMSE, s.e., and tstat
if j == 27:
return (np.mean(retrmse), np.std(retrmse), np.mean(t_stat))

if __name__ == "__main__":
RDAT = main()
print(RDAT)

# print results
print("---Results--------------------------------------------")
print("Baseline: ", round(RDAT.iloc[0, 1], 2), "(RMSE)",
round(RDAT.iloc[0, 2], 2), "(se)",
round(RDAT.iloc[0, 1], 3), "(t-stat)")
print("Degree Day: ", round(RDAT.iloc[1, 1], 2), "(RMSE)",
round(RDAT.iloc[0, 2], 2), "(se)",
round(RDAT.iloc[1, 3], 2), "(t-stat)")
print("------------------------------------------------------")
print("% Change from Baseline: ", round(RDAT.iloc[1, 4], 4)*100, "%")
print("------------------------------------------------------")
``````

Get this bounty!!!

This site uses Akismet to reduce spam. Learn how your comment data is processed.