Bike
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
데이터 전처리¶
# Load the raw test/train CSV files.
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Preview each frame together with ITS OWN schema summary.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
print(test.head(2))
test.info()
print()
print(train.head(2))
train.info()
print()
datetime season holiday workingday weather temp atemp \ 0 2011-01-20 00:00:00 1 0 1 1 10.66 11.365 1 2011-01-20 01:00:00 1 0 1 1 10.66 13.635 humidity windspeed 0 56 26.0027 1 56 0.0000 <class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null object 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: float64(3), int64(8), object(1) memory usage: 1020.7+ KB None datetime season holiday workingday weather temp atemp \ 0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 humidity windspeed casual registered count 0 81 0.0 3 13 16 1 80 0.0 8 32 40 <class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null object 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: float64(3), int64(8), object(1) memory usage: 1020.7+ KB None
# Look at the first raw timestamp and at the pieces obtained by splitting it
# on whitespace ("date time"), then on '-' and ':'.
first_stamp = train['datetime'][0]
stamp_parts = first_stamp.split()
date_fields = stamp_parts[0].split('-')
time_fields = stamp_parts[1].split(':')
print(first_stamp)
print(date_fields[2])  # day of month
print(time_fields[2])  # seconds
# NOTE(review): the seconds field is printed a second time here; one of the
# two prints was possibly meant to show a different field - confirm.
print(time_fields[2])
2011-01-01 00:00:00 01 00 00
from datetime import datetime
import calendar
# Parse the timestamp strings once and derive every calendar feature from the
# parsed values.  All features are integers; slicing the text with str.split
# would leave hour/minute/second as zero-padded strings such as '00', unlike
# the integer year/month/day columns.
# The original 'datetime' column is left untouched (still a string).
stamps = pd.to_datetime(train['datetime'], format='%Y-%m-%d %H:%M:%S')
train['year'] = stamps.dt.year
train['month'] = stamps.dt.month
train['day'] = stamps.dt.day
train['hour'] = stamps.dt.hour
train['minute'] = stamps.dt.minute
train['second'] = stamps.dt.second
# Monday=0 ... Sunday=6, the same convention as datetime.weekday().
train['weekday'] = stamps.dt.weekday
train.head(3)
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | year | month | day | hour | minute | second | weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 2011 | 1 | 1 | 00 | 00 | 00 | 5 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 2011 | 1 | 1 | 01 | 00 | 00 | 5 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 2011 | 1 | 1 | 02 | 00 | 00 | 5 |
# Replace the integer codes of the categorical columns with readable labels
# so that plot axes show names instead of numbers.
season_labels = {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'}
weather_labels = {
    1: 'Clear',
    2: 'Mist, few clouds',
    3: 'Light snow',
    4: 'Heavy snow',
}
train['season'] = train['season'].map(season_labels)
train['weather'] = train['weather'].map(weather_labels)
train.head(3)
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | year | month | day | hour | minute | second | weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | Spring | 0 | 0 | Clear | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 2011 | 1 | 1 | 00 | 00 | 00 | 5 |
1 | 2011-01-01 01:00:00 | Spring | 0 | 0 | Clear | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 2011 | 1 | 1 | 01 | 00 | 00 | 5 |
2 | 2011-01-01 02:00:00 | Spring | 0 | 0 | Clear | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 2011 | 1 | 1 | 02 | 00 | 00 | 5 |
데이터 조사¶
데이터의 신뢰성이 떨어지거나(`windspeed`) 상관관계가 적절하지 않을 때(`minute`, `second`)는 feature를 선택하기 어렵다.
import seaborn as sns
import matplotlib as mpl
# Compare the raw distribution of the target with its log-transformed version.
mpl.rc('font', size=14)
# Give the figure its final size up front so the layout is computed for it.
figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.histplot(train['count'], ax=axes[0])
sns.histplot(np.log(train['count']), ax=axes[1])
# tight_layout() spaces the subplots using the labels that exist when it is
# called, so it has to run after the plots have been drawn.
figure.tight_layout()
<AxesSubplot:xlabel='count', ylabel='Count'>
# Mean count per value of each time-derived feature, one bar plot per panel.
mpl.rc('font', size=14)
# Give the figure its final size up front so the layout is computed for it.
figure, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 9))

# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
time_units = ['year', 'month', 'day', 'hour', 'minute', 'second']
for axis, unit in zip(axes.flat, time_units):
    sns.barplot(x=unit, y='count', data=train, ax=axis)

axes[0, 0].set(title='year')
# The day and hour panels have many categories; rotate their tick labels so
# they stay readable.
axes[1, 0].tick_params(axis='x', labelrotation=90)
axes[1, 1].tick_params(axis='x', labelrotation=90)

# tight_layout() spaces the subplots using the labels that exist when it is
# called, so it has to run after the plots have been drawn.
figure.tight_layout()
# Relationship between count and the categorical / hourly / temperature features.
mpl.rc('font', size=14)
# Give the figure its final size up front so the layout is computed for it.
figure, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# Top row: distribution of count per category.
sns.boxplot(x='weather', y='count', data=train, ax=axes[0, 0])
axes[0, 0].tick_params(axis='x', labelrotation=10)
sns.boxplot(x='season', y='count', data=train, ax=axes[0, 1])
# Rotate the labels of the season panel itself, i.e. axes[0, 1] - not the
# hourly panel at axes[1, 0].
axes[0, 1].tick_params(axis='x', labelrotation=10)
sns.boxplot(x='holiday', y='count', data=train, ax=axes[0, 2])

# Bottom row: hourly profile split by workingday / weekday, and the linear
# trend of count against temperature.
sns.pointplot(x='hour', y='count', data=train, hue='workingday', ax=axes[1, 0])
sns.pointplot(x='hour', y='count', data=train, hue='weekday', ax=axes[1, 1])
sns.regplot(x='temp', y='count', data=train, line_kws={'color': 'red'}, ax=axes[1, 2])

# tight_layout() spaces the subplots using the labels that exist when it is
# called, so it has to run after the plots have been drawn.
figure.tight_layout()
<AxesSubplot:xlabel='temp', ylabel='count'>
# Scatter plot with a fitted regression line for count against temperature
# (left) and against wind speed (right).
# regplot is used rather than jointplot because jointplot is a figure-level
# function and cannot draw into an existing Axes.
figure, axes = plt.subplots(nrows=1, ncols=2)
figure.set_size_inches(12, 5)
temp_axis, wind_axis = axes
shared_options = dict(y='count', data=train, line_kws={'color': 'red'})
sns.regplot(x='temp', ax=temp_axis, **shared_options)
sns.regplot(x='windspeed', ax=wind_axis, **shared_options)
<AxesSubplot:xlabel='windspeed', ylabel='count'>
correlation matrix를 통해 feature를 선택하는 것은 좋은 방법이다. 피어슨 상관계수를 사용하면 두 변수가 선형적으로 상관되어 있는지를 알 수 있다. 상관계수는 -1과 1 사이의 값을 가지며, 1에 가까울수록 강한 양의 상관관계를 가지고, -1에 가까울수록 강한 음의 상관관계를 가진다. 상관계수가 0에 가까울수록 두 변수 사이의 선형 관계가 약하다(단, 상관계수가 0이라고 해서 두 변수가 반드시 독립인 것은 아니다).
# Pairwise correlation (DataFrame.corr default) between the target and the
# continuous weather measurements, drawn as an annotated heat map.
numeric_columns = ['count', 'temp', 'atemp', 'humidity', 'windspeed']
correlations = train[numeric_columns].corr()
sns.heatmap(correlations, annot=True, cmap='RdYlGn', linewidths=0.2)
<AxesSubplot:>
데이터셋 재로드, 분석 준비¶
# Start over from the pre-split files (this discards the exploratory columns
# added to the previous `train` frame) and stack them into one frame so the
# same feature engineering is applied to both parts.
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
# ignore_index=True renumbers the rows 0..n-1: train rows first, then test rows.
frames = [train, test]
df = pd.concat(frames, ignore_index=True)
- 유효한 시간 feature: `year`, `month`, `hour`, `weekday`
- `weather` 값 중 `4`는 한 건뿐인 outlier이므로 분석 대상에서 제외
- 관련성이 적은 feature: `casual`, `registered`, `datetime`, `windspeed`
- 중복성이 있는 feature: `season` | `month`
# Parse every timestamp exactly once (instead of once per derived column per
# row) and read the calendar features from the parsed values.
stamps = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')
df['year'] = stamps.dt.year
df['month'] = stamps.dt.month
df['hour'] = stamps.dt.hour
# Monday=0 ... Sunday=6, the same convention as datetime.weekday().
df['weekday'] = stamps.dt.weekday
df.head(3)
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | year | month | hour | weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012-06-03 10:00:00 | 2 | 0 | 0 | 1 | 25.42 | 31.06 | 43 | 15.0013 | 127 | 277 | 404 | 2012 | 6 | 10 | 6 |
1 | 2012-08-11 10:00:00 | 3 | 0 | 0 | 1 | 29.52 | 34.09 | 62 | 6.0032 | 125 | 282 | 407 | 2012 | 8 | 10 | 5 |
2 | 2011-01-05 23:00:00 | 1 | 0 | 1 | 1 | 8.20 | 12.88 | 47 | 0.0000 | 1 | 18 | 19 | 2011 | 1 | 23 | 2 |
# Keep only rows whose weather code is not 4, then remove the columns that
# are not used as model features.
unused_columns = ['datetime', 'casual', 'registered', 'windspeed']
keep_rows = df['weather'] != 4
df = df[keep_rows].drop(unused_columns, axis=1)
데이터 셋 분리¶
# Split the combined frame back into its train and test parts.
# `df` was built with ignore_index=True, so index labels 0..len(train)-1 are
# the rows that came from `train` and the remaining labels came from `test`.
# Splitting on the index label (instead of a hard-coded row position) keeps
# the boundary correct even though rows were filtered out of `df` afterwards.
is_train_row = df.index < len(train)
train_part = df[is_train_row]
test_part = df[~is_train_row]

# Target vectors.
y_train = train_part['count']
y_test = test_part['count']

# Feature matrices: drop() returns new frames, so no column is deleted from a
# slice of `df` (which would risk a SettingWithCopyWarning).
X_train = train_part.drop(columns='count')
X_test = test_part.drop(columns='count')
RMSLE 적용 준비¶
$\sqrt{\frac{1}{n}\sum_{i=1}^{n}(\log(y_i+1)-\log(\hat{y_i}+1))^2}$
def RMSLE(y, pred, convertExp):
    """Return the root mean squared logarithmic error of ``pred`` against ``y``.

    RMSLE = sqrt(mean((log(y + 1) - log(pred + 1)) ** 2))

    Args:
        y: Actual target values (any array-like: list, ndarray, Series).
        pred: Predicted values, positionally matched with ``y``.
        convertExp: If true, ``y`` and ``pred`` are taken to be natural-log
            values and are exponentiated back before the error is computed.

    Returns:
        The RMSLE as a NumPy float.
    """
    # Coerce to float arrays so plain lists/tuples work too; `list + 1` would
    # otherwise raise TypeError when convertExp is false.
    y = np.asarray(y, dtype=float)
    pred = np.asarray(pred, dtype=float)

    # Undo the log transform when the inputs are on the log scale.
    if convertExp:
        y = np.exp(y)
        pred = np.exp(pred)

    # log1p(x) == log(x + 1) but stays accurate for small x.  NaNs produced by
    # the log (inputs below -1) are replaced with 0 by nan_to_num.
    log_y = np.nan_to_num(np.log1p(y))
    log_pred = np.nan_to_num(np.log1p(pred))

    return np.sqrt(np.mean(np.square(log_y - log_pred)))
GD¶
$$w_{i+1}=w_i-a\frac{\partial}{\partial w}MSE$$
$w_{i+1}$ : 구하고자 하는 직선의 기울기, $a$ : Learning Rate
$$trainY=\theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_3$$
학습할 때는 $Y$, $x_n$이 알고 있는 값이고, $\theta_n$이 구하려는 값이다.
$$predY=\theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_3$$
예측할 때는 $x_n$, $\theta_n$이 알고 있는 값이고, $Y$가 구하려는 값이다.
$Y$ = 종속변수(target value), $\theta$ = 회귀계수(weight), $x$ = 독립변수(feature)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# The model is trained on the natural log of the target; the test target is
# transformed the same way so both can be scored on the same scale.
log_y_train = np.log(y_train)
log_y_test = np.log(y_test)

Lin = LinearRegression()
Lin.fit(X_train, log_y_train)

# Report the fitted coefficients and the intercept, separated by a blank line.
print(
    f'선형회귀 결과의 계수 {Lin.coef_}',
    '',
    f'선형회귀 결과의 상수항 {Lin.intercept_}',
    sep='\n',
)
선형회귀 결과의 계수[-0.0607321 0.06509771 -0.00879684 -0.00356091 0.01314642 0.03130814 -0.01555035 0.43602723 0.08079056 0.10418806 0.02540429] 선형회귀 결과의 계수-874.2092765565683
# Predict for the held-out rows.  The model was fitted on log(count), so these
# values are on the log scale.
Lin.predict(X_test)
array([5.56497568, 3.55244034, 4.10531686, ..., 5.64101879, 3.44636883, 6.32838637])
# Score the train split first, then the test split.  Targets and predictions
# are both on the log scale, hence convertExp=True.
for features, log_target in ((X_train, log_y_train), (X_test, log_y_test)):
    print(RMSLE(log_target, Lin.predict(features), True))
1.0194650914142418 1.016633868776997
# Predicted vs. actual log(count) for the train and test splits, drawn in the
# same figure together with the identity line (a perfect prediction falls on it).
plt.figure(figsize=(10, 5))
diagonal = np.linspace(0, 7)

# Train is drawn first with a red identity line, then test with a green one.
splits = (
    (X_train, log_y_train, 'red'),
    (X_test, log_y_test, 'green'),
)
for features, log_target, line_color in splits:
    points, identity_line = plt.plot(
        Lin.predict(features), log_target, 'o',
        diagonal, diagonal, line_color,
    )
    plt.setp(points, markersize=1)
    plt.setp(identity_line, linewidth=2)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

plt.show()
댓글남기기