본문 바로가기

프로그래머스 데브 코스/TIL

[6기] 프로그래머스 인공지능 데브코스 68일차 TIL

1107

첫 팀활동

데이터 전처리 코드 간단하게 정리

나머지는 슬랙에 팀과 공유
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

from google.colab import drive

# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')

# IPython line magic (only valid inside a notebook, not in plain Python):
# change the working directory to the Drive folder named in the path.
# NOTE(review): presumably this is where train.csv / test.csv live — confirm.
%cd /content/drive/MyDrive/jeju_data

def seed_everything(seed):
    """Seed the global random number generators used by this script.

    Seeds Python's ``random`` module and NumPy's legacy global generator
    with ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Value forwarded unchanged to ``random.seed`` and
            ``np.random.seed``.
    """
    hash_seed = str(seed)
    random.seed(seed)
    # Only the environment variable is written here; the hash seed of the
    # already-running interpreter is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)

# Fix the random seed before anything else runs.
seed_everything(42)

# Load the training set, the test set and the sample submission, in that
# order, from paths relative to the current working directory.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', 'sample_submission.csv')
)

# Expose the time-series structure to the model by splitting the timestamp
# string into integer year, month and day columns, on both train and test.
# The slices below take characters 0-3, 5-6 and 8-9 of each timestamp
# (assumes a 'YYYY-MM-DD...' string layout — TODO confirm against the CSVs).
date_parts = {'year': slice(0, 4), 'month': slice(5, 7), 'day': slice(8, 10)}

for frame in (train_df, test_df):
    for part_name, chars in date_parts.items():
        frame[part_name] = frame['timestamp'].apply(
            lambda stamp, chars=chars: int(stamp[chars])
        )

# Drop the columns that are not used for training.
unused_cols = ['ID', 'timestamp', 'supply(kg)']
train = train_df.drop(columns=unused_cols)

# Convert the categorical columns to integer labels. A fresh encoder is
# fitted per column; after the loop `le` holds only the last one.
qual_col = ['item', 'corporation', 'location']

for col_name in qual_col:
    le = LabelEncoder()
    encoded = le.fit_transform(train[col_name])
    train[col_name] = encoded

print('Done.')

# Carry-over price per calendar month (index = month - 1). A slot is refreshed
# whenever a (year, month, item) group has at least one non-zero price, and is
# otherwise reused as-is.
# NOTE(review): the list is never reset between items, so a group with no
# non-zero prices reuses whatever that month slot last held (0 at the start),
# possibly from a different item — confirm this carry-over is intended.
price = [0] * 12


def _fill_zero_prices(df, fallback, year, month, item):
    """Overwrite zero prices of one (year, month, item) group in place.

    The group's mean non-zero price is stored in ``fallback[month - 1]`` when
    it exists, and every zero-priced row of the group is then set to
    ``fallback[month - 1]``.

    The original guard was ``mean != 'nan'``, which compares a float with a
    string and is therefore always True: a group without any non-zero price
    replaced the carried value with NaN and wrote NaN into its rows.
    ``pd.notna`` performs the intended check.

    Args:
        df: DataFrame with 'year', 'month', 'item' and 'price(원/kg)' columns;
            modified in place.
        fallback: 12-element list of carry-over prices, modified in place.
        year: Year value selecting the group.
        month: Month value (1-12) selecting the group.
        item: Item code selecting the group.
    """
    in_group = (df['year'] == year) & (df['month'] == month) & (df['item'] == item)
    is_zero = df['price(원/kg)'] == 0

    # Series.mean() returns NaN when the selection is empty.
    group_mean = df.loc[in_group & ~is_zero, 'price(원/kg)'].mean()
    if pd.notna(group_mean):
        fallback[month - 1] = group_mean

    df.loc[in_group & is_zero, 'price(원/kg)'] = fallback[month - 1]


# Years 2019-2022: visit every month of every item code 0-4, walking the years
# in ascending order so a month's slot carries forward from earlier years.
for i in range(5):
    for m in range(1, 13):
        for y in range(2019, 2023):
            _fill_zero_prices(train, price, y, m, i)

# Year 2023: only visit the months that actually contain zero-priced rows.
for i in range(5):
    zero_rows_2023 = train[
        (train['year'] == 2023) & (train['item'] == i) & (train['price(원/kg)'] == 0)
    ]
    for m in zero_rows_2023['month'].unique():
        _fill_zero_prices(train, price, 2023, m, i)




# 데이터셋 아이템에 따라 달마다 평균 가격 확인

import numpy as np
import matplotlib.pyplot as plt

# One scatter panel per item code (0-4): month on the x-axis, price on the
# y-axis. The original repeated the same stanza five times and titled every
# panel "Accuracy Curve", which mislabels a price scatter and makes the panels
# indistinguishable; each panel is now titled with its item code.
plt.figure(figsize=(10, 20))

panel_colors = ['red', 'blue', 'green', 'yellow', 'black']
axes = []

for item_code, color in enumerate(panel_colors):
    # Panels after the first share the first panel's x-axis.
    shared_x = axes[0] if axes else None
    axes.append(plt.subplot(len(panel_colors), 1, item_code + 1, sharex=shared_x))

    item_rows = train[train['item'] == item_code]
    plt.scatter(item_rows['month'], item_rows['price(원/kg)'], c=color, alpha=0.4)
    plt.title(f"Price by month (item {item_code})")
    plt.xlabel("month")
    plt.ylabel("price")

# Keep the per-panel axis names the original script defined.
ax1, ax2, ax3, ax4, ax5 = axes

plt.tight_layout()
plt.show()