DATA_STUDY

데이터 분석_라이브러리_함수_모음_

datawithu 2023. 11. 1. 22:49

기본 라이브러리 임포트 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

데이터 불러오기

# Load a dataset: put the file path between the quotes and keep only the
# reader that matches the file format. As written, the Excel call rebinds
# `df`, replacing the result of the CSV call.
df = pd.read_csv('')
df = pd.read_excel('')
dj = pd.read_json('')
dx = pd.read_xml('')

데이터 탐색하기

# Quick inspection of the DataFrame (results are shown when run in a notebook cell).
df.head()      # first rows
df.tail()      # last rows
df.info()      # column dtypes and non-null counts
df.describe()  # summary statistics
df.index       # row labels
df.columns     # column labels
df.values      # underlying values as an array

결측치 확인하기

# Count missing values per column; both spellings are used interchangeably here.
df.isna().sum()
df.isnull().sum()

특정 열의 값들의 건수/비율 확인하기

# Frequency of each distinct value in a column.
df['A'].value_counts()
# normalize=True reports proportions instead of counts.
df['B'].value_counts(normalize=True)

  특정 열 삭제

# Drop several columns at once.
cols = ['A','B','C']
df = df.drop(cols, axis=1)
# Equivalent in-place form. Use only ONE of the two: dropping the same
# columns a second time raises KeyError because they no longer exist.
# df.drop(cols, axis=1, inplace=True)

값 대체 (replace)

# Replace one value with another everywhere in the DataFrame, in place.
df.replace('변경전' ,'변경후', inplace=True )
# Lists and dicts are accepted too; a dict maps each old value to its new value.
df.replace( { '전':'후', '전2':'후2'  }, inplace=True  )

결측치 채우기

# Signature, for reference only (calling it with no value/method raises an error):
#   df.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)

# Fill every missing value with the same constant.
df.fillna('A')
# Fill with a different constant per column.
df.fillna(value={'col1':'A','col2':'B'})
# Backward / forward fill. These replace fillna(method='bfill'/'ffill'),
# whose `method` argument is deprecated in recent pandas releases.
df.bfill()
df.ffill()
# Fill one column with a statistic computed from that same column.
df['A'].fillna(df['A'].median())
df['A'].fillna(df['A'].max())
df['A'].fillna(df['A'].min())
df['A'].fillna(df['A'].std())

데이터 타입에 따른 컬럼명 

# Column names split by dtype: object-typed columns ...
obj = df.select_dtypes(include='object').columns
# ... and every column that is not object-typed.
obj_x = df.select_dtypes(exclude='object').columns

 원핫엔코딩/라벨엔코딩

# Label encoding: map each category of one column to an integer code.
from sklearn.preprocessing import LabelEncoder
L_en = LabelEncoder()
df['C'] = L_en.fit_transform(df['C'])

# One-hot encoding: OneHotEncoder needs 2-D input (hence df[['D']]) and
# returns one column per category as a sparse matrix, so the result cannot
# be assigned back to the single column 'D'. Build a frame and swap it in.
from sklearn.preprocessing import  OneHotEncoder
O_en = OneHotEncoder()
onehot = O_en.fit_transform(df[['D']]).toarray()
onehot_df = pd.DataFrame(onehot,
                         columns=O_en.get_feature_names_out(['D']),
                         index=df.index)
df = pd.concat([df.drop('D', axis=1), onehot_df], axis=1)

get_dummies

 cols = ['','']
 df_dummy =  pd.get_dummies(df, columns=cols) #drop_first=True옵션도 있음

 


그래프

# Histogram (fill in the column names for x and hue)
sns.histplot(x='', hue='',data=df)

# KDE plot
sns.kdeplot(df['A'])

# Count plot
sns.countplot(x='', data=df)

# Bar chart of value frequencies
df['A'].value_counts().plot(kind='bar')

# Heatmap of the correlation matrix of the selected columns
heat = df[['A','B','C']].corr()
sns.heatmap( heat, annot=True, cmap='Blues' )# NOTE(review): the original note mentioned fmt='d'; that format suits integer cells only

# Box plot
sns.boxplot(x= '', y= '', data=df )

데이터 분리하기

# Separate features from the label column.
# `target` was never defined in the original; set it to the label column name.
target = ''
data = pd.read_csv('')
x_data = data.drop(target, axis = 1)   # every column except the label
y_data = data.loc[:, target]           # the label column only

학습/테스트 데이터 분리하기

from sklearn.model_selection import train_test_split

# Hold out part of the data for testing. The original left test_size and
# random_state empty, which is a syntax error; these are adjustable defaults.
# stratify=y_data keeps class proportions equal in both splits; remove it
# for a regression target.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=42,
)

데이터 표준화/정규화

# Use only ONE of the two scalers; running both would scale the data twice.
# The scaler is fitted on the training split only and then applied to the
# test split. The original read X_train/X_test (capital X), which do not
# exist; the split produces x_train/x_test.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Alternative: standardisation (zero mean, unit variance).
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_test = scaler.transform(x_test)

머신러닝 모델링 

1 ) 분류 라이브러리 & 모델 선언

# Pick ONE classifier; if several lines are kept, the last assignment to
# `model` wins. Hyper-parameter values are starting points to adjust (the
# original left them empty, which is a syntax error).
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000, C=1.0)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)  # was misspelled `moedel`

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=5, random_state=42)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=100)

from lightgbm import LGBMClassifier
model = LGBMClassifier(n_estimators=100)

1) 분류 모델학습

# Train on the training split (the variable is x_train, lower-case, as produced by the split).
model.fit(x_train, y_train)

1) 분류 성능평가

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report

# Predict labels for the held-out split.
y_pred = model.predict(x_test)

# Results: accuracy, recall and precision, in that order.
# NOTE(review): f1_score is imported but not printed here.
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

1) 분류 히트맵 

# Confusion matrix drawn as an annotated heatmap.
heat = confusion_matrix(y_test, y_pred)

# The colour map is selected with `cmap`; the original passed color='Blues',
# which is not the colormap argument. fmt='d' prints the counts as integers.
sns.heatmap(heat, annot=True, fmt='d', cmap='Blues')

2 ) 회귀 라이브러리 & 모델 선언

# Pick ONE regressor; if several lines are kept, the last assignment to
# `model` wins.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=5)  # was `moedel` with an empty n_neighbors=

from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

from xgboost import XGBRegressor
model = XGBRegressor()

from lightgbm import LGBMRegressor
model = LGBMRegressor()

2) 회귀 학습

# Train on the training split (the variable is x_train, lower-case, as produced by the split).
model.fit(x_train, y_train)

2) 회귀 성능평가

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Predict on the held-out split.
y_pred = model.predict(x_test)

# Results: RMSE, MAE and R^2.
# RMSE is taken as the square root of MSE because the `squared=False`
# argument is deprecated and removed in recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))


# Regression coefficients (the learned weights).
# NOTE(review): presumably only meaningful for the linear model; confirm
# which estimator `model` refers to before reading these attributes.
model.coef_

# Bias term (y-intercept).
model.intercept_

딥러닝 모델링

 

라이브러리 불러오기

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Dropout,Input,Activation, BatchNormalization,Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.backend import clear_session
clear_session()

x_train :모델 입력값 확인하기

# Shape of the model input; used to choose the input size of the first layer.
x_train.shape

y_train :  아웃풋값 확인하기

# Shape of the targets; used to choose the size of the output layer.
y_train.shape

1) DNN 모델링

# Small fully connected network: one hidden layer with dropout.
# The original used undefined placeholder names for the two sizes below.
n_features = x_train.shape[1]  # assumes x_train is 2-D (samples, features) - confirm
n_outputs = 1                  # 1 for binary classification; number of classes for multi-class

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
# Optional extras, not required: Flatten() here, BatchNormalization() after Dropout.
model.add(Dropout(0.25))
# 'sigmoid' for binary classification; use 'softmax' for multi-class.
model.add(Dense(n_outputs, activation='sigmoid'))

# Stop training once validation loss stops improving.
# patience=0 stops at the first epoch without improvement; raise it to
# tolerate noisy validation curves.
es = EarlyStopping(monitor='val_loss',
                   patience=0,
                   mode='min',
                   verbose=1)

# Save the weights of the best epoch only. save_best_only=True is what makes
# the checkpoint track the best val_loss (the original comment noted it was
# needed but the argument was missing). With save_weights_only=True, Keras 3
# requires the path to end in '.weights.h5', so the '.ckpt' name is replaced.
ms = ModelCheckpoint('my_checkpoint.weights.h5',
                     monitor='val_loss',
                     mode='min',
                     verbose=1,
                     save_best_only=True,
                     save_weights_only=True)

# Binary classification setup; metrics=['accuracy'] can be added to also track accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy')
# Multi-class classification: loss='sparse_categorical_crossentropy'
# Regression: loss='mse'

# Train with validation on the held-out split. `.history` keeps only the
# dict of per-epoch metrics, which the loss plot reads.
# epochs and batch_size were left empty in the original (a syntax error);
# the values below are starting points to adjust.
history = model.fit(x_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_data=(x_test,y_test),
                    callbacks=[es,ms],
                    verbose=True
                   ).history
model.summary()

2) 성능평가

# Binary classification: threshold the predicted probabilities at 0.5.
# NOTE(review): run only the variant that matches the model's output layer.
y_pred = model.predict(x_test)
y_pred_x = np.where(y_pred>=0.5, 1,0 )
print(classification_report(y_test, y_pred_x))

# Multi-class classification: take the index of the highest score per row.
y_pred = model.predict(x_test)
y_pred_x= np.argmax(y_pred,axis=1)
print(classification_report(y_test, y_pred_x))

 

3) 학습/검증 손실 그래프

# Draw training and validation loss per epoch on the same axes.
for metric_key, curve_label in (('loss', 'Train_Loss'),
                                ('val_loss', 'Validation_Loss')):
    plt.plot(history[metric_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

 

다들 화이팅입니다


추가) 어떻게 머신러닝 모델을  최적화 할까요?

그것은 바로바로  여러 모델을 테스트 하고, 최적의 파라미터를 찾는 겁니다.

 본 모델들은 회귀 모델을 기준으로 작성되었습니다.

1. 모델 불러오기 & 그리드서치 & 성능평가

# 불러오기
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

1) Linear regression

# 선언하기
model = LinearRegression()
# 성능예측
cv_score = cross_val_score(model, x_train, y_train, cv=5)
# 결과확인
print('평균:', cv_score.mean())

# 결과수집
result = {}
result['Linear Regression'] = cv_score.mean()

2) KNN

# 선언하기
model = KNeighborsRegressor()
# 성능예측
cv_score = cross_val_score(model, x_train_s, y_train, cv=5)
# 결과확인
print('평균:', cv_score.mean())
# 결과수집
result['KNN'] = cv_score.mean()

3)Decision Tree

# 선언하기
model = DecisionTreeRegressor(random_state=1)
# 성능예측
cv_score = cross_val_score(model, x_train, y_train, cv=5)
# 결과확인
print('평균:', cv_score.mean())
# 결과수집
result['Decision Tree'] = cv_score.mean()

4) RandomForest

# 선언하기
model = RandomForestRegressor(random_state=1)
# 성능예측
cv_score = cross_val_score(model, x_train, y_train, cv=5)
# 결과확인
print('평균:', cv_score.mean())
# 결과수집
result['Random Forest'] = cv_score.mean()

5)XGBoost

# 선언하기
model = XGBRegressor(random_state=1)
# 성능예측
cv_score = cross_val_score(model, x_train, y_train, cv=5)
# 결과확인
print('평균:', cv_score.mean())
# 결과수집
result['XGBoost'] = cv_score.mean()

6) LightGBM

# 선언하기
model = LGBMRegressor(random_state=1, verbose=-100)
# 성능예측
cv_score = cross_val_score(model, x_train, y_train, cv=5)
# 결과확인
print('평균:', cv_score.mean())
# 결과수집
result['LightGBM'] = cv_score.mean()

2.  모델들의 성능 평가하기 

# 성능 비교
print('=' * 40)
for m_name, score in result.items():
    print(m_name, score.round(3))
print('=' * 40)

##시각화
# 성능 시각화 비교
plt.barh(list(result.keys()), result.values())
plt.show()

3.성능 튜닝하기 

- 위에서 성능이 가장 좋은 모델을 가지고 튜닝을 합니다

# 기본 모델 선언
model_rfr = RandomForestRegressor(random_state=1)

# 파라미터 지정
  # max_depth: range(1, 21)
param = {'max_depth': range(1, 21)}

# 모델 선언
model = GridSearchCV(model_rfr,
                     param,
                     cv=5,
                     scoring='r2')
                     
# 학습하기 (시간 많이 걸릴수도 있다용)
model.fit(x_train, y_train)

최적의 파라미터를 알려줘

# 최적 파라미터, 예측 최고 성능
print(model.best_params_)
print(model.best_score_)

변수 중요도

# 변수 중요도 시각화
plt.figure(figsize=(6, 5))
plt.barh(list(x), model.best_estimator_.feature_importances_)
plt.show()

4. 최적 파라미터가 적용된 모델의 성능 평가

# 예측하기
y_pred = model.predict(x_test)
# 성능평가
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

 

 

다들 건승!