1107
첫 팀활동
데이터 전처리 코드 간단하게 정리
나머지는 슬랙에 팀과 공유
import random
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings(action='ignore')
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# IPython magic (valid only inside a notebook/IPython session): switch the
# working directory to the data folder so the relative CSV paths read below
# resolve there.
%cd /content/drive/MyDrive/jeju_data
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed value.
    """
    random.seed(seed)
    # The hash seed of the running interpreter is fixed at startup, so this
    # only affects Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
# Fix the random seeds before anything else runs
seed_everything(seed=42)

# Read the train, test and sample-submission tables from the working directory
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', 'sample_submission.csv')
)
# To expose the time-series structure to the model, split the timestamp string
# into integer year, month and day columns (fixed character positions 0-4,
# 5-7 and 8-10 of each timestamp).
for frame in (train_df, test_df):
    stamps = frame['timestamp']
    frame['year'] = stamps.apply(lambda stamp: int(stamp[:4]))
    frame['month'] = stamps.apply(lambda stamp: int(stamp[5:7]))
    frame['day'] = stamps.apply(lambda stamp: int(stamp[8:10]))
# Drop the columns that will not be used for training
train = train_df.drop(['ID', 'timestamp', 'supply(kg)'], axis=1)

# Convert the categorical columns to integer codes, one encoder per column
qual_col = ['item', 'corporation', 'location']
for col_name in qual_col:
    le = LabelEncoder()
    le.fit(train[col_name])
    train[col_name] = le.transform(train[col_name])
print('Done.')
def _fill_zero_prices(frame, item_codes, years, price_col='price(원/kg)'):
    """Replace zero prices with the mean non-zero price of the same item and month.

    For every (item, month, year) cell, rows whose price is 0 are overwritten
    with the mean of that cell's non-zero prices. When a cell has no non-zero
    price at all, the most recent mean seen for the same item and month (from
    an earlier year in ``years``) is used instead; if there is none yet, the
    zeros are left as they are.

    Args:
        frame: DataFrame with 'item', 'year', 'month' and ``price_col``
            columns. Modified in place.
        item_codes: Iterable of 'item' column values to process.
        years: Iterable of years, in chronological order.
        price_col: Name of the price column.
    """
    # The fill values are fractional means, so make the column float up front
    # rather than relying on an implicit upcast during assignment.
    frame[price_col] = frame[price_col].astype(float)

    for item in item_codes:
        # Last known mean per calendar month. Reset for every item so that one
        # item's prices are never used to fill another item's rows.
        last_mean = [0] * 12
        in_item = frame['item'] == item
        for month in range(1, 13):
            in_item_month = in_item & (frame['month'] == month)
            for year in years:
                cell = in_item_month & (frame['year'] == year)
                mean_price = frame.loc[cell & (frame[price_col] != 0), price_col].mean()
                # mean() of an empty selection is NaN; NaN must be detected
                # with notna(), since comparing it to the string 'nan' is
                # always unequal and would let NaN through.
                if pd.notna(mean_price):
                    last_mean[month - 1] = mean_price
                frame.loc[cell & (frame[price_col] == 0), price_col] = last_mean[month - 1]


# Items are addressed by their label-encoded codes 0-4 (assumes the 'item'
# column has exactly five categories -- TODO confirm). Years run 2019-2023 in
# one chronological pass so 2023 can fall back on the same item's earlier years.
_fill_zero_prices(train, item_codes=range(5), years=range(2019, 2024))
# Inspect price by month for each item: one scatter panel per item code
import numpy as np
import matplotlib.pyplot as plt

# One colour per item code; the list position is the item code.
item_colors = ['red', 'blue', 'green', 'yellow', 'black']

plt.figure(figsize=(10, 20))
shared_ax = None
for item_code, color in enumerate(item_colors):
    # Every panel after the first shares the first panel's x axis.
    ax = plt.subplot(len(item_colors), 1, item_code + 1, sharex=shared_ax)
    if shared_ax is None:
        shared_ax = ax
    item_rows = train[train['item'] == item_code]
    plt.scatter(item_rows['month'], item_rows['price(원/kg)'], c=color, alpha=0.4)
    # Title names what is plotted (the panels show prices, not an accuracy curve).
    plt.title(f"Item {item_code}: price by month")
    plt.xlabel("month")
    plt.ylabel("price")
plt.tight_layout()
plt.show()