ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • 데이터 분석_라이브러리_함수_모음_
    DATA_STUDY 2023. 11. 1. 22:49

    기본 라이브러리 임포트 

    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt

    데이터 불러오기

    df = pd.read_csv('')
    df = pd.read_excel('')
    dj = pd.read_json('')
    dx = pd.read_xml('')

    데이터 탐색하기

    df.head()
    df.tail()
    df.info()
    df.describe()
    df.index
    df.columns
    df.values

    결측치 확인하기

    df.isna().sum()
    df.isnull().sum()

    특정 열의 값들의 건수/비율 확인하기

    df['A'].value_counts()
    df['B'].value_counts(normalize=True)

      특정 열 삭제

    cols = ['A','B','C']
    df = df.drop(cols, axis=1)
    #또는
    df.drop(cols, axis=1, inplace=True)

    값 대체 (replace)

    df.replace('변경전' ,'변경후', inplace=True )
    #리스트, 딕셔너리도 가능함
    df.replace( { '전':'후', '전2':'후2'  }, inplace=True  )

    결측치 채우기

    df.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)
    
    df.fillna('A')
    df.fillna(value={'col1':'A','col2':'B'})
    df.fillna(method='bfill')
    df.fillna(method='ffill')
    # 통계값으로 채울 때 쓰는 함수 (예: df['A'].fillna(df['A'].median()))
    median()
    max()
    min()
    std()

    데이터 타입에 따른 컬럼명 

    obj = df.select_dtypes(include='object').columns
    obj_x = df.select_dtypes(exclude='object').columns

     원핫엔코딩/라벨엔코딩

    from sklearn.preprocessing import LabelEncoder
    L_en = LabelEncoder()
    df['C'] = L_en.fit_transform(df['C'])
    
    from sklearn.preprocessing import  OneHotEncoder
    O_en = OneHotEncoder()
    d_onehot = O_en.fit_transform(df[['D']]).toarray() # 입력은 2차원(DataFrame)이어야 하고, 결과는 범주 수만큼의 열을 가진 배열

    get_dummies

     cols = ['','']
     df_dummy =  pd.get_dummies(df, columns=cols) #drop_first=True옵션도 있음

     


    그래프

    #히스토그램
    sns.histplot(x='', hue='',data=df)
    
    #kde플롯
    sns.kdeplot(df['A'])
    
    #카운트플롯
    sns.countplot(x='', data=df)
    
    # 막대그래프
    df['A'].value_counts().plot(kind='bar')
    
    #히트맵
    heat = df[['A','B','C']].corr()
    sns.heatmap( heat, annot=True, cmap='Blues' ) # 정수 데이터라면 fmt='d' 옵션 추가 가능
    
    #박스
    sns.boxplot(x= '', y= '', data=df )

    데이터 분리하기 (x, y 분리)

    data = pd.read_csv('')
    x_data = data.drop(target, axis = 1)
    y_data = data.loc[:, target]

    데이터 분리하기 (학습/평가용 분리)

    from sklearn.model_selection import train_test_split
    
    x_train,x_test, y_train,y_test = train_test_split(x_data, y_data,test_size=, stratify=y_data, random_state= )

    데이터 표준화/정규화

    # 둘 중 하나만 쓸것 
    
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    
    
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    머신러닝 모델링 

    1 ) 분류 라이브러리 & 모델 선언

    #한개만 사용하기
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(max_iter= ,  C= )
    
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier(n_neighbors=)
    
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier(max_depth=, random_state=)
    
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=, random_state=)
    
    from xgboost import XGBClassifier
    model = XGBClassifier(n_estimators= )
    
    from lightgbm import LGBMClassifier
    model = LGBMClassifier(n_estimators=)

    1) 분류 모델학습

    model.fit(x_train, y_train)

    1) 분류 성능평가

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import confusion_matrix 
    from sklearn.metrics import classification_report
    
    y_pred = model.predict(x_test)
    
    #결과
    print(accuracy_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))

    1) 분류 히트맵 

    heat = confusion_matrix(y_test, y_pred)
    
    sns.heatmap(heat, annot=True, fmt='d',cmap='Blues')

    2 ) 회귀 라이브러리 & 모델 선언

    #한개만 사용하기
    from sklearn.linear_model import LinearRegression
    model = LinearRegression(  )
    
    from sklearn.neighbors import KNeighborsRegressor
    model = KNeighborsRegressor(n_neighbors=)
    
    from sklearn.tree import DecisionTreeRegressor
    model = DecisionTreeRegressor()
    
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor()
    
    from xgboost import XGBRegressor
    model = XGBRegressor( )
    
    from lightgbm import LGBMRegressor
    model = LGBMRegressor()

    2) 회귀 학습

    model.fit(x_train, y_train)

    2) 회귀 성능평가 

    from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
    
    y_pred = model.predict(x_test)
    
    #결과
    print(mean_squared_error(y_test, y_pred, squared=False)) # RMSE를 반환 (scikit-learn 1.4부터 squared 인자는 deprecated → root_mean_squared_error 사용)
    print(mean_absolute_error(y_test, y_pred))
    print(r2_score(y_test, y_pred))
    
    
    # 회귀계수 = 가중치 확인 : 
    model.coef_
    
    # 편향(y절편) = 
    model.intercept_

    딥러닝 모델링

     

    라이브러리 불러오기

    import tensorflow as tf
    from tensorflow import keras
    
    from tensorflow.keras.models import Sequential, load_model, Model
    from tensorflow.keras.layers import Dense, Dropout,Input,Activation, BatchNormalization,Flatten
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
    from tensorflow.keras.utils import to_categorical
    
    from tensorflow.keras.backend import clear_session
    clear_session()

    x_train :모델 입력값 확인하기

    x_train.shape

    y_train :  아웃풋값 확인하기

    y_train.shape

    1) DNN 모델링

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(x인풋값,)))
    #model.add(Flatten()) # 없어도 됨..
    model.add(Dropout(0.25))
    #model.add( BatchNormalization( )) #없어도 됨..
    model.add(Dense(아웃풋, activation='sigmoid')) # 2진분류,  다중분류= softmax
    
    es = EarlyStopping( monitor='val_loss', 
                       patience=0, 
                       mode='min',
                       verbose=1 ) 
    ms = ModelCheckpoint( 'my_checkpoint.ckpt',  # 'best_model.h5'로 저장하는 경우는 아래 주석 참고
                         monitor='val_loss',
                         mode='min', 
                         verbose=1,   # verbose=True 옵션과 동일
                         save_weights_only=True,) # 'best_model.h5'의 경우에는 save_best_only=True 라고 해야 작동되었음
    
    model.compile(optimizer='adam', loss='binary_crossentropy') #이진분류 metrics=['accuracy']
    #다중 분류의 경우  loss='sparse_categorical_crossentropy' #다중손실함수...
    # 회귀는 loss='mse'
    
    history = model.fit(x_train, y_train, 
                        epochs=,
                        batch_size=,
                        validation_data=(x_test,y_test),
                        callbacks=[es,ms],
                        verbose=True
                       ).history
    model.summary()

    2) 성능평가

    #이진분류
    y_pred = model.predict(x_test)
    y_pred_x = np.where(y_pred>=0.5, 1,0 )
    print(classification_report(y_test, y_pred_x))
    
    #다중분류
    y_pred = model.predict(x_test)
    y_pred_x= np.argmax(y_pred,axis=1)
    print(classification_report(y_test, y_pred_x))

     

    3) 학습/검증 손실 그래프 

    plt.plot(history['loss'], label='Train_Loss')
    plt.plot(history['val_loss'], label='Validation_Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

     

    다들 화이팅입니다


    추가) 어떻게 머신러닝 모델을  최적화 할까요?

    그것은 바로바로  여러 모델을 테스트 하고, 최적의 파라미터를 찾는 겁니다.

     본 모델들은 회귀 모델을 기준으로 작성되었습니다.

    1. 모델 불러오기 & 그리드서치 & 성능평가

    # 불러오기
    from sklearn.linear_model import LinearRegression
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor
    
    from sklearn.model_selection import cross_val_score, GridSearchCV
    from sklearn.metrics import mean_absolute_error, r2_score

    1) Linear regression

    # 선언하기
    model = LinearRegression()
    # 성능예측
    cv_score = cross_val_score(model, x_train, y_train, cv=5)
    # 결과확인
    print('평균:', cv_score.mean())
    
    # 결과수집
    result = {}
    result['Linear Regression'] = cv_score.mean()

    2) KNN

    # 선언하기
    model = KNeighborsRegressor()
    # 성능예측 (x_train_s: 스케일링한 x_train — KNN은 거리 기반이라 스케일링된 데이터 사용)
    cv_score = cross_val_score(model, x_train_s, y_train, cv=5)
    # 결과확인
    print('평균:', cv_score.mean())
    # 결과수집
    result['KNN'] = cv_score.mean()

    3)Decision Tree

    # 선언하기
    model = DecisionTreeRegressor(random_state=1)
    # 성능예측
    cv_score = cross_val_score(model, x_train, y_train, cv=5)
    # 결과확인
    print('평균:', cv_score.mean())
    # 결과수집
    result['Decision Tree'] = cv_score.mean()

    4) RandomForest

    # 선언하기
    model = RandomForestRegressor(random_state=1)
    # 성능예측
    cv_score = cross_val_score(model, x_train, y_train, cv=5)
    # 결과확인
    print('평균:', cv_score.mean())
    # 결과수집
    result['Random Forest'] = cv_score.mean()

    5)XGBoost

    # 선언하기
    model = XGBRegressor(random_state=1)
    # 성능예측
    cv_score = cross_val_score(model, x_train, y_train, cv=5)
    # 결과확인
    print('평균:', cv_score.mean())
    # 결과수집
    result['XGBoost'] = cv_score.mean()

    6) LightGBM

    # 선언하기
    model = LGBMRegressor(random_state=1, verbose=-100)
    # 성능예측
    cv_score = cross_val_score(model, x_train, y_train, cv=5)
    # 결과확인
    print('평균:', cv_score.mean())
    # 결과수집
    result['LightGBM'] = cv_score.mean()

    2.  모델들의 성능 평가하기 

    # 성능 비교
    print('=' * 40)
    for m_name, score in result.items():
        print(m_name, score.round(3))
    print('=' * 40)
    
    ##시각화
    # 성능 시각화 비교
    plt.barh(list(result.keys()), result.values())
    plt.show()

    3.성능 튜닝하기 

    - 위에서 성능이 가장 좋은 모델을 가지고 튜닝을 합니다

    # 기본 모델 선언
    model_rfr = RandomForestRegressor(random_state=1)
    
    # 파라미터 지정
      # max_depth: range(1, 21)
    param = {'max_depth': range(1, 21)}
    
    # 모델 선언
    model = GridSearchCV(model_rfr,
                         param,
                         cv=5,
                         scoring='r2')
                         
    # 학습하기 (시간 많이 걸릴수도 있다용)
    model.fit(x_train, y_train)

    최적의 파라미터를 알려줘

    # 최적 파라미터, 예측 최고 성능
    print(model.best_params_)
    print(model.best_score_)

    변수 중요도

    # 변수 중요도 시각화
    plt.figure(figsize=(6, 5))
    plt.barh(list(x_data), model.best_estimator_.feature_importances_) # list(x_data): 변수(컬럼) 이름
    plt.show()

    4. 최적 파라미터가 적용된 모델의 성능 평가

    # 예측하기
    y_pred = model.predict(x_test)
    # 성능평가
    print(mean_absolute_error(y_test, y_pred))
    print(r2_score(y_test, y_pred))

     

     

    다들 건승!


     

     

     

     

     

     

     

     

     

     

     

Designed by Tistory.