

[Cohort 6] Programmers AI Dev Course - Day 44 TIL


[Week 8 - Day 3] ML_basics - Practice

Model Training

Working through the assignment
df_dataset = df_del[['total_items', 'subtotal', 'num_distinct_items',
                     'min_item_price', 'max_item_price', 'total_onshift', 'total_busy',
                     'total_outstanding_orders', 'delivery_time',
                     'estimated_store_to_consumer_driving_duration']]
df_dataset


from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(df_dataset, test_size=0.1)

X_train = train_dataset.drop(labels='delivery_time',axis=1)
y_train = train_dataset['delivery_time']
X_val = val_dataset.drop(labels='delivery_time',axis=1)
y_val = val_dataset['delivery_time']

print('Training features:', X_train.shape)
print('Training labels:', y_train.shape)
print('Validation features:', X_val.shape)
print('Validation labels:', y_val.shape)


X_train


from sklearn.linear_model import LinearRegression
import time

start_time = time.time()  # start time

model = LinearRegression()
model.fit(X_train, y_train)

print("소요된 시간(초 단위):", time.time() - start_time) # 실행 시간


# - Evaluate on the training data
score = model.score(X_train, y_train)
print(f'Train : {score}')

# - Evaluate on the validation data
score = model.score(X_val, y_val)
print(f'Validation : {score}')
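
For reference, score() on an sklearn regressor returns the coefficient of determination R², so the two numbers above are the train and validation R². A quick sanity check that computes the same value by hand (a minimal sketch, assuming model, X_val, y_val from the cells above):

import numpy as np

# R^2 = 1 - SS_res / SS_tot, which is what model.score() reports
y_hat = model.predict(X_val)
ss_res = np.sum((y_val - y_hat) ** 2)
ss_tot = np.sum((y_val - y_val.mean()) ** 2)
print('manual R^2:', 1 - ss_res / ss_tot)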


import matplotlib.pyplot as plt

y_predict = model.predict(X_val)

plt.scatter(y_val, y_predict, alpha=0.4)
plt.xlabel("Actual time")
plt.ylabel("Predicted time")
plt.title("MULTIPLE LINEAR REGRESSION")
plt.show()


plt.figure(figsize=(20,8))
df_del.boxplot()
plt.xticks(rotation=45)
plt.show()


df_del = df_del[df_del['delivery_time'] <= 7200]   # outlier removal (preprocessing) done here as well

plt.figure(figsize=(20,8))
df_del.boxplot()
plt.xticks(rotation=45)
plt.show()


df_del = df_del[df_del['subtotal'] <= 20000]
df_del = df_del[df_del['min_item_price'] <= 10000]
df_del = df_del[df_del['max_item_price'] <= 10000]

plt.figure(figsize=(20,8))
df_del.boxplot()
plt.xticks(rotation=45)
plt.show()


plt.figure(figsize=(20,8))
df_del[['subtotal', 'min_item_price', 'max_item_price']].boxplot()
plt.xticks(rotation=45)
plt.show()


plt.figure(figsize=(20,8))
df_del[['total_items', 'num_distinct_items', 'total_onshift', 'total_busy',
        'total_outstanding_orders']].boxplot()
plt.xticks(rotation=45)
plt.show()


df_del = df_del[df_del['total_items'] <= 100]

plt.figure(figsize=(20,8))
df_del[['total_items', 'num_distinct_items', 'total_onshift', 'total_busy',
        'total_outstanding_orders']].boxplot()
plt.xticks(rotation=45)
plt.show()


plt.figure(figsize=(20,8))
df_del[['estimated_order_place_duration',
        'estimated_store_to_consumer_driving_duration']].boxplot()
plt.xticks(rotation=45)
plt.show()


df_del = df_del[df_del['estimated_order_place_duration'] <= 2500]

plt.figure(figsize=(20,8))
df_del[['estimated_order_place_duration',
        'estimated_store_to_consumer_driving_duration']].boxplot()
plt.xticks(rotation=45)
plt.show()
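
The cutoffs above (7200 s delivery time, 20000 subtotal, 100 items, and so on) were picked by eye from the boxplots. As a cross-check, an IQR rule could replace the hand-picked thresholds; a minimal sketch, assuming df_del from above (the 1.5 * IQR multiplier is just the usual convention, not part of the assignment):

def iqr_filter(df, col, k=1.5):
    # keep rows inside [Q1 - k*IQR, Q3 + k*IQR] for the given column
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    return df[(df[col] >= q1 - k * iqr) & (df[col] <= q3 + k * iqr)]

df_iqr = df_del.copy()
for col in ['subtotal', 'min_item_price', 'max_item_price', 'total_items']:
    df_iqr = iqr_filter(df_iqr, col)
print(len(df_del), '->', len(df_iqr))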


df_del


df_del.reset_index(drop=True, inplace=True)
df_del


df_dataset = df_del[['total_items', 'subtotal', 'num_distinct_items',
                     'min_item_price', 'max_item_price', 'total_onshift',
                     'total_busy', 'total_outstanding_orders',
                     'estimated_store_to_consumer_driving_duration', 'delivery_time']]
df_dataset


from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(df_dataset, test_size=0.1)

X_train = train_dataset.drop(labels='delivery_time',axis=1)
y_train = train_dataset['delivery_time']
X_val = val_dataset.drop(labels='delivery_time',axis=1)
y_val = val_dataset['delivery_time']

print('Training features:', X_train.shape)
print('Training labels:', y_train.shape)
print('Validation features:', X_val.shape)
print('Validation labels:', y_val.shape)


X_train


from sklearn.linear_model import LinearRegression
import time

start_time = time.time()  # start time

model = LinearRegression()
model.fit(X_train, y_train)

print("소요된 시간(초 단위):", time.time() - start_time) # 실행 시간


# - Evaluate on the training data
score = model.score(X_train, y_train)
print(f'Train : {score}')

# - Evaluate on the validation data
score = model.score(X_val, y_val)
print(f'Validation : {score}')


y_predict = model.predict(X_val)

plt.scatter(y_val, y_predict, alpha=0.4)
plt.xlabel("Actual time")
plt.ylabel("Predicted time")
plt.title("MULTIPLE LINEAR REGRESSION")
plt.show()

from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Build the feature matrix and target
X = df_dataset.drop(labels='delivery_time', axis=1)
y = df_dataset['delivery_time']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Create the SGDRegressor model
model = SGDRegressor(alpha=0.01, max_iter=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("R-squared:", r2)



#



num_iterations = 100
mse_scores = []
r2_scores = []

for i in range(num_iterations):
    # use a different split each iteration; a fixed random_state would make all 100 runs identical
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=i)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    # collect the metrics for each split
    mse_scores.append(mse)
    r2_scores.append(r2)

# report the average performance
average_mse = np.mean(mse_scores)
average_r2 = np.mean(r2_scores)

print("Average MSE:", average_mse)
print("Average R-squared:", average_r2)




#




from sklearn.linear_model import Ridge

# Apply degree-2 polynomial feature expansion (polynomial regression)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("R-squared:", r2)





#




from sklearn.linear_model import Lasso

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

model = Lasso(alpha=1.0)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("R-squared:", r2)





#




X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

model = Lasso(alpha=0.01)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("R-squared:", r2)





#




from sklearn.model_selection import GridSearchCV

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

# Create the Lasso model
model = Lasso()

# Hyperparameter grid to search
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Use GridSearchCV to find the best hyperparameter
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Refit with the best hyperparameter (Lasso, matching the searched model) and report it
best_alpha = grid_search.best_params_['alpha']
best_model = Lasso(alpha=best_alpha)
best_model.fit(X_train, y_train)

print("Best alpha:", best_alpha)
print("Best model:", best_model)




#




from sklearn.model_selection import GridSearchCV

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

# Create the Ridge model
model = Ridge()

# Hyperparameter grid to search
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Use GridSearchCV to find the best hyperparameter
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Refit with the best hyperparameter and report it
best_alpha = grid_search.best_params_['alpha']
best_model = Ridge(alpha=best_alpha)
best_model.fit(X_train, y_train)

print("Best alpha:", best_alpha)
print("Best model:", best_model)





#




from sklearn.model_selection import GridSearchCV

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

# Create the Ridge model
model = Ridge()

# Hyperparameter grid to search (larger alpha values)
param_grid = {
    'alpha': [100, 1000, 10000],
}

# Use GridSearchCV to find the best hyperparameter
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Refit with the best hyperparameter and report it
best_alpha = grid_search.best_params_['alpha']
best_model = Ridge(alpha=best_alpha)
best_model.fit(X_train, y_train)

print("Best alpha:", best_alpha)
print("Best model:", best_model)



#



# Rebuild the degree-2 polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

model = Lasso(alpha=0.1)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("R-squared:", r2)

print('Train score : ', model.score(X_train, y_train))
print('Validation score : ', model.score(X_val, y_val))





#




# Rebuild the degree-2 polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)

# Create the Ridge model (Lasso left commented out)
#model = Lasso()
model = Ridge(alpha=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("R-squared:", r2)

print('Train score : ', model.score(X_train, y_train))
print('Validation score : ', model.score(X_val, y_val))




#  Next step: inspect the learned weights and try to improve accuracy
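
A minimal sketch of that weight inspection, assuming model, poly, and X from the cells above (get_feature_names_out needs scikit-learn 1.0+; older versions use get_feature_names):

import pandas as pd

# pair each expanded polynomial feature with its learned Ridge coefficient
feature_names = poly.get_feature_names_out(X.columns)
coef = pd.Series(model.coef_, index=feature_names)

# largest-magnitude weights first
print(coef.reindex(coef.abs().sort_values(ascending=False).index).head(15))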