import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
import numpy as np
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV,HalvingGridSearchCV,StratifiedKFold,RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from utils import flatten_categorical_columns, reverse_get_dummies
from catboost import CatBoostClassifier




def split_data(df, only_quality_cut=True, validation=False):
    """Label-encode the target column and draw a stratified train/test split.

    Args:
        df: DataFrame holding the features together with the target column.
        only_quality_cut: when true the target is ``QUALITY_CUT``, otherwise
            it is ``DEFECT_TYPE``.
        validation: when true 10% of the rows are held out, otherwise 20%.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The X parts are row subsets
        of ``df`` with the target column still present; the y parts are the
        integer codes produced by ``LabelEncoder``.
    """
    target_column = 'QUALITY_CUT' if only_quality_cut else 'DEFECT_TYPE'
    held_out_fraction = 0.1 if validation else 0.2
    encoded_target = LabelEncoder().fit_transform(df[target_column])
    # Stratify on the encoded target so both partitions keep the class mix.
    return tuple(
        train_test_split(
            df,
            encoded_target,
            test_size=held_out_fraction,
            stratify=encoded_target,
        )
    )


def catboost_classifier(X):
    """Print CatBoost per-class metrics averaged over repeated splits.

    The rows of ``X`` are shuffled once with a fixed seed. Then, ten times, a
    fresh stratified 80/20 split on ``DEFECT_TYPE`` is drawn, a CatBoost model
    with fixed hyper-parameters is trained, and a classification report is
    computed on both the test and the training partition. The per-class
    precision, recall, f1-score and support averaged over the runs are
    printed.

    Args:
        X: DataFrame containing ``DEFECT_TYPE`` plus the ``NOZZLE_TYPE`` and
            ``MATERIAL_NAME_TULUS`` columns, which are handed to CatBoost as
            categorical features.

    Returns:
        None. Results are written to stdout.
    """
    X = X.sample(frac=1.0, random_state=42)

    classes = ['0', '1', '2', '3', '4']
    metrics = ['precision', 'recall', 'f1-score', 'support']
    cat_col_indices = ['NOZZLE_TYPE', 'MATERIAL_NAME_TULUS']
    test_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}
    train_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}

    n = 10
    for _ in tqdm(range(n)):
        X_train, X_test, y_train, y_test = split_data(X, only_quality_cut=False)
        X_train = X_train.drop(["DEFECT_TYPE"], axis=1)
        X_test = X_test.drop(["DEFECT_TYPE"], axis=1)

        # NOTE(review): these values all lie inside the search space of a
        # RandomizedSearchCV that used to be kept here in disabled form;
        # presumably they are its result -- TODO confirm.
        model = CatBoostClassifier(
            min_data_in_leaf=10,
            learning_rate=0.05,
            l2_leaf_reg=5,
            iterations=500,
            grow_policy='Depthwise',
            depth=8,
            bootstrap_type='Bernoulli',
            verbose=0,
        )
        model.fit(X_train, y_train, cat_features=cat_col_indices, verbose=0)

        test_report = classification_report(
            y_test, model.predict(X_test), output_dict=True, zero_division=0.0
        )
        train_report = classification_report(
            y_train, model.predict(X_train), output_dict=True, zero_division=0.0
        )
        # A class absent from a report contributes 0.0 to its running totals.
        for totals, report in ((test_totals, test_report), (train_totals, train_report)):
            for cls in classes:
                for m in metrics:
                    totals[cls][m] += report.get(cls, {}).get(m, 0.0)

    # Sample counts refer to the last split drawn.
    print(f"\nNumber of training samples:{len(X_train)}")
    print(f"Number of test samples:{len(X_test)}")
    sections = (
        ("Cat Boost Test Mean Class Accuracy: \n ", test_totals),
        ("\n CatBoost Train Mean Class Accuracy: \n ", train_totals),
    )
    for heading, totals in sections:
        print(heading)
        for cls in classes:
            line = f'class: {cls} , '
            for m in metrics:
                mean = totals[cls][m] / n
                if m != 'support':
                    line += f'{m}: {mean:.2f} , '
                else:
                    line += f'{m}: {int(mean)}'
            print(line)


def mlp_classifier(X,X_train, X_test, y_train, y_test):
    """Print MLP per-class metrics averaged over 100 stratified splits.

    Each run draws a fresh 80/20 split of ``X`` on ``DEFECT_TYPE``,
    standardises the continuous columns with statistics fitted on the training
    partition only, trains an ``MLPClassifier`` with fixed hyper-parameters
    and computes a classification report on both partitions.

    Args:
        X: DataFrame containing ``DEFECT_TYPE`` and the feature columns.
        X_train, X_test, y_train, y_test: accepted for call compatibility;
            every run overwrites them with a fresh split of ``X``.

    Returns:
        None. Results are written to stdout.
    """
    continous_features = [
        "THICKNESS_TULUS [mm]",
        "CONTOUR_SPEED [mm/min]",
        "LASER_POWER [W]",
        "CONTOUR_GAS_PRESSURE [bar]",
        "CONTOUR_NOZZLE_DISTANCE [mm]",
        "CONTOUR_FOCAL [mm]",
    ]
    classes = ['0', '1', '2', '3', '4']
    metrics = ['precision', 'recall', 'f1-score', 'support']
    # Fixed configuration; built once because it does not change between runs.
    best_params = {
        'activation': 'relu',
        'alpha': 0.001,
        'batch_size': 32,
        'early_stopping': True,
        'hidden_layer_sizes': (50, 30),
        'learning_rate': 'constant',
        'learning_rate_init': 0.005,
        'max_iter': 300,
        'solver': 'adam',
        'validation_fraction': 0.2,
    }
    test_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}
    train_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}

    n = 100
    for _ in range(n):
        X_train, X_test, y_train, y_test = split_data(X, only_quality_cut=False)
        X_train = X_train.drop(["DEFECT_TYPE"], axis=1)
        X_test = X_test.drop(["DEFECT_TYPE"], axis=1)

        # Fit the scaler on the training rows only, then reuse it for the
        # test rows so no test statistics leak into training.
        scaler = StandardScaler()
        scaled_train = pd.DataFrame(
            scaler.fit_transform(X_train[continous_features]),
            columns=continous_features,
            index=X_train.index,
        )
        scaled_test = pd.DataFrame(
            scaler.transform(X_test[continous_features]),
            columns=continous_features,
            index=X_test.index,
        )
        other_train = X_train[[col for col in X_train.columns if col not in continous_features]]
        other_test = X_test[[col for col in X_test.columns if col not in continous_features]]
        X_train_scaled = pd.concat([scaled_train, other_train], axis=1)
        X_test_scaled = pd.concat([scaled_test, other_test], axis=1)

        mlp = MLPClassifier(random_state=42, **best_params)
        mlp.fit(X_train_scaled, y_train)

        test_report = classification_report(
            y_test, mlp.predict(X_test_scaled), output_dict=True, zero_division=0.0
        )
        train_report = classification_report(
            y_train, mlp.predict(X_train_scaled), output_dict=True, zero_division=0.0
        )
        # A class absent from a report contributes 0.0 to its running totals.
        for totals, report in ((test_totals, test_report), (train_totals, train_report)):
            for cls in classes:
                for m in metrics:
                    totals[cls][m] += report.get(cls, {}).get(m, 0.0)

    # Sample counts refer to the last split drawn.
    print(f"\nNumber of training samples:{len(X_train)}")
    print(f"Number of test samples:{len(X_test)}")
    sections = (
        ("MLPClassifier Test Mean Class Accuracy: \n ", test_totals),
        ("\n MLPClassifier Train Mean Class Accuracy: \n ", train_totals),
    )
    for heading, totals in sections:
        print(heading)
        for cls in classes:
            line = f'class: {cls} , '
            for m in metrics:
                mean = totals[cls][m] / n
                if m != 'support':
                    line += f'{m}: {mean:.2f} , '
                else:
                    line += f'{m}: {int(mean)}'
            print(line)



def apply_decision_tree(X,X_train, X_test, y_train, y_test):
    """Print decision-tree per-class metrics averaged over 100 splits.

    Each run draws a fresh stratified 80/20 split of ``X`` on
    ``DEFECT_TYPE``, fits a ``DecisionTreeClassifier`` with fixed
    hyper-parameters and computes a classification report on both the test
    and the training partition.

    Args:
        X: DataFrame containing ``DEFECT_TYPE`` and the feature columns.
        X_train, X_test, y_train, y_test: accepted for call compatibility;
            every run overwrites them with a fresh split of ``X``.

    Returns:
        None. Results are written to stdout.
    """
    classes = ['0', '1', '2', '3', '4']
    metrics = ['precision', 'recall', 'f1-score', 'support']
    test_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}
    train_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}

    n = 100
    for _ in range(n):
        X_train, X_test, y_train, y_test = split_data(X, only_quality_cut=False)
        X_train = X_train.drop(["DEFECT_TYPE"], axis=1)
        X_test = X_test.drop(["DEFECT_TYPE"], axis=1)

        dt_model = DecisionTreeClassifier(
            random_state=42,
            ccp_alpha=0.0005,
            class_weight=None,
            criterion='entropy',
            max_depth=20,
            max_features=None,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0005,
            min_samples_leaf=2,
            min_samples_split=2,
            splitter='best',
        )
        dt_model.fit(X_train, y_train)

        test_report = classification_report(
            y_test, dt_model.predict(X_test), output_dict=True, zero_division=0.0
        )
        train_report = classification_report(
            y_train, dt_model.predict(X_train), output_dict=True, zero_division=0.0
        )
        # A class absent from a report contributes 0.0 to its running totals.
        for totals, report in ((test_totals, test_report), (train_totals, train_report)):
            for cls in classes:
                for m in metrics:
                    totals[cls][m] += report.get(cls, {}).get(m, 0.0)

    # Sample counts refer to the last split drawn.
    print(f"\nNumber of training samples:{len(X_train)}")
    print(f"Number of test samples:{len(X_test)}")
    sections = (
        ("Decision Tree Test Mean Class Accuracy: \n ", test_totals),
        ("\n Decision Tree Train Mean Class Accuracy: \n ", train_totals),
    )
    for heading, totals in sections:
        print(heading)
        for cls in classes:
            line = f'class: {cls} , '
            for m in metrics:
                mean = totals[cls][m] / n
                if m != 'support':
                    line += f'{m}: {mean:.2f} , '
                else:
                    line += f'{m}: {int(mean)}'
            print(line)



def apply_random_forest(X, X_train, X_test, y_train, y_test):
    """Print random-forest per-class metrics averaged over 100 splits.

    Each run draws a fresh stratified 80/20 split of ``X`` on
    ``DEFECT_TYPE``, fits a ``RandomForestClassifier`` with fixed
    hyper-parameters and computes a classification report on both the test
    and the training partition. The forest is not seeded, so results vary
    between invocations.

    Args:
        X: DataFrame containing ``DEFECT_TYPE`` and the feature columns.
        X_train, X_test, y_train, y_test: accepted for call compatibility;
            every run overwrites them with a fresh split of ``X``.

    Returns:
        None. Results are written to stdout.
    """
    classes = ['0', '1', '2', '3', '4']
    metrics = ['precision', 'recall', 'f1-score', 'support']
    test_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}
    train_totals = {cls: {m: 0.0 for m in metrics} for cls in classes}

    n = 100
    for _ in range(n):
        X_train, X_test, y_train, y_test = split_data(X, only_quality_cut=False)
        X_train = X_train.drop(["DEFECT_TYPE"], axis=1)
        X_test = X_test.drop(["DEFECT_TYPE"], axis=1)

        rf_model = RandomForestClassifier(
            bootstrap=True,
            ccp_alpha=0.0,
            class_weight=None,
            criterion='entropy',
            max_depth=40,
            max_features='sqrt',
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=2,
            min_samples_split=5,
            n_estimators=50,
        )
        rf_model.fit(X_train, y_train)

        test_report = classification_report(
            y_test, rf_model.predict(X_test), output_dict=True, zero_division=0.0
        )
        train_report = classification_report(
            y_train, rf_model.predict(X_train), output_dict=True, zero_division=0.0
        )
        # A class absent from a report contributes 0.0 to its running totals.
        for totals, report in ((test_totals, test_report), (train_totals, train_report)):
            for cls in classes:
                for m in metrics:
                    totals[cls][m] += report.get(cls, {}).get(m, 0.0)

    # Sample counts refer to the last split drawn.
    print(f"\nNumber of training samples:{len(X_train)}")
    print(f"Number of test samples:{len(X_test)}")
    sections = (
        ("Random Forest Test Mean Class Accuracy: \n ", test_totals),
        ("\nRandom Forest Train Mean Class Accuracy: \n ", train_totals),
    )
    for heading, totals in sections:
        print(heading)
        for cls in classes:
            line = f'class: {cls} , '
            for m in metrics:
                mean = totals[cls][m] / n
                if m != 'support':
                    line += f'{m}: {mean:.2f} , '
                else:
                    line += f'{m}: {int(mean)}'
            print(line)

def apply_KNN(X,X_train, X_test, y_train, y_test):
    """Print KNN per-class test metrics averaged over 100 stratified splits.

    The sample counts in the header come from the partitions passed in by the
    caller. Afterwards each run draws its own 80/20 split of ``X`` on
    ``DEFECT_TYPE``, standardises every column with statistics fitted on the
    training rows, fits a 9-neighbour classifier and scores it on the test
    rows.

    Args:
        X: DataFrame containing ``DEFECT_TYPE`` and the feature columns.
        X_train, X_test: only their lengths are reported in the header.
        y_train, y_test: overwritten by the per-run splits.

    Returns:
        None. Results are written to stdout.
    """
    classes = ['0', '1', '2', '3', '4']
    metrics = ['precision', 'recall', 'f1-score', 'support']
    totals = {cls: {m: 0.0 for m in metrics} for cls in classes}

    print(f"\nNumber of training samples:{len(X_train)}")
    print(f"Number of test samples:{len(X_test)}")
    print("KNN Mean Class Accuracy: \n ")

    n = 100
    for _ in range(n):
        train_part, test_part, y_train, y_test = split_data(X, only_quality_cut=False)
        train_part = train_part.drop(["DEFECT_TYPE"], axis=1)
        test_part = test_part.drop(["DEFECT_TYPE"], axis=1)

        # All columns are scaled here, not just the continuous ones.
        scaler = StandardScaler()
        train_scaled = pd.DataFrame(scaler.fit_transform(train_part), index=train_part.index)
        test_scaled = pd.DataFrame(scaler.transform(test_part), index=test_part.index)

        # NOTE(review): p=2 looks redundant next to metric='manhattan' --
        # confirm which distance was intended.
        knn_model = KNeighborsClassifier(
            algorithm='auto',
            leaf_size=30,
            metric='manhattan',
            metric_params=None,
            n_neighbors=9,
            p=2,
            weights='uniform',
        )
        knn_model.fit(train_scaled, y_train)
        report = classification_report(
            y_test, knn_model.predict(test_scaled), output_dict=True, zero_division=0.0
        )
        # A class absent from the report contributes 0.0 to its running totals.
        for cls in classes:
            per_class = report.get(cls, {})
            for m in metrics:
                totals[cls][m] += per_class.get(m, 0.0)

    for cls in classes:
        fields = []
        for m in metrics:
            mean = totals[cls][m] / n
            fields.append(f'{m}: {int(mean)}' if m == 'support' else f'{m}: {mean:.2f} , ')
        print(f'class: {cls} , ' + ''.join(fields))



def linear_svm(X_train, X_test, y_train, y_test):
    """Grid-search a linear-kernel SVC and print the best configuration.

    The continuous columns are standardised with statistics fitted on the
    training rows. Every combination in the parameter grid is then fitted and
    scored by the unweighted mean of the per-class f1-scores on ``X_test``;
    each score is printed as it is computed, followed by the best parameters
    and best score.

    NOTE(review): candidates are ranked on the test partition, so the reported
    best score is an optimistic estimate -- consider a separate validation
    split.

    Args:
        X_train: training features (target column already removed).
        X_test: test features (target column already removed).
        y_train: encoded training labels.
        y_test: encoded test labels.

    Returns:
        None. Results are written to stdout.
    """
    continous_features = [
        "THICKNESS_TULUS [mm]",
        "CONTOUR_SPEED [mm/min]",
        "LASER_POWER [W]",
        "CONTOUR_GAS_PRESSURE [bar]",
        "CONTOUR_NOZZLE_DISTANCE [mm]",
        "CONTOUR_FOCAL [mm]",
    ]
    classes = ['0', '1', '2', '3', '4']
    param_grid = {
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'class_weight': [None, 'balanced'],
        'max_iter': [1000, 5000, 10000, -1],  # -1 means "no limit"
        'tol': [1e-4, 1e-3, 1e-2],
    }

    # The scaled frames do not depend on the candidate parameters, so they are
    # built once instead of once per grid point.
    scaler = StandardScaler()
    scaled_continuous_train = pd.DataFrame(
        scaler.fit_transform(X_train[continous_features]),
        columns=continous_features,
        index=X_train.index,
    )
    scaled_continuous_test = pd.DataFrame(
        scaler.transform(X_test[continous_features]),
        columns=continous_features,
        index=X_test.index,
    )
    non_continuous_train = X_train[[col for col in X_train.columns if col not in continous_features]]
    non_continuous_test = X_test[[col for col in X_test.columns if col not in continous_features]]
    X_train_scaled = pd.concat([scaled_continuous_train, non_continuous_train], axis=1)
    X_test_scaled = pd.concat([scaled_continuous_test, non_continuous_test], axis=1)

    best_accuracy = 0.0
    best_params = []  # stays an empty list if no candidate scores above 0
    for param in tqdm(ParameterGrid(param_grid)):
        svc = SVC(kernel='linear', **param)
        svc.fit(X_train_scaled, y_train)
        report = classification_report(
            y_test, svc.predict(X_test_scaled), output_dict=True, zero_division=0.0
        )
        # Macro-style score: plain mean of the per-class f1 values.
        score = np.mean([report.get(cls, {}).get('f1-score', 0.0) for cls in classes])
        print(score)
        if score > best_accuracy:
            best_accuracy = score
            best_params = svc.get_params()

    print(f'best params: {best_params}\n')
    print(f'best score: {best_accuracy}')







def apply_SVM(X_train, X_test, y_train, y_test):
    """Print per-class train/test metrics for a linear and a polynomial SVC.

    The continuous columns are standardised with statistics fitted on the
    training rows; one SVC per kernel is then fitted on the supplied
    partitions and scored on both of them.

    The model is evaluated once per kernel: the partitions are fixed inputs
    and ``SVC`` only uses randomness for probability estimates (not enabled
    here), so repeating the fit would reproduce the same report each time.

    Args:
        X_train: training features (target column already removed).
        X_test: test features (target column already removed).
        y_train: encoded training labels.
        y_test: encoded test labels.

    Returns:
        None. Results are written to stdout.
    """
    ker = ["linear", "poly"]
    # Best params: {'linear': {'C': 100}, 'poly': {'C': 1, 'class_weight': None, 'coef0': 1, 'degree': 6, 'gamma': 'scale'}}
    # NOTE(review): the note above lists C=1 for the poly kernel, but the
    # model below is built with C=100 -- confirm which one is intended.
    continous_features = [
        "THICKNESS_TULUS [mm]",
        "CONTOUR_SPEED [mm/min]",
        "LASER_POWER [W]",
        "CONTOUR_GAS_PRESSURE [bar]",
        "CONTOUR_NOZZLE_DISTANCE [mm]",
        "CONTOUR_FOCAL [mm]",
    ]
    classes = ['0', '1', '2', '3', '4']
    metrics = ['precision', 'recall', 'f1-score', 'support']

    print(f"\nNumber of training samples:{len(X_train)}")
    print(f"Number of test samples:{len(X_test)}")
    print("SVM Mean Class Accuracy: \n")

    # Preprocessing is identical for both kernels, so it is done once.
    scaler = StandardScaler()
    scaled_continuous_train = pd.DataFrame(
        scaler.fit_transform(X_train[continous_features]),
        columns=continous_features,
        index=X_train.index,
    )
    scaled_continuous_test = pd.DataFrame(
        scaler.transform(X_test[continous_features]),
        columns=continous_features,
        index=X_test.index,
    )
    non_continuous_train = X_train[[col for col in X_train.columns if col not in continous_features]]
    non_continuous_test = X_test[[col for col in X_test.columns if col not in continous_features]]
    X_train_scaled = pd.concat([scaled_continuous_train, non_continuous_train], axis=1)
    X_test_scaled = pd.concat([scaled_continuous_test, non_continuous_test], axis=1)

    for k in ker:
        if k == "poly":
            svm_model = SVC(kernel=k, C=100, class_weight=None, coef0=1, degree=6, gamma='scale')
        else:
            svm_model = SVC(kernel=k, C=100)
        svm_model.fit(X_train_scaled, y_train)

        report = classification_report(
            y_test, svm_model.predict(X_test_scaled), output_dict=True, zero_division=0.0
        )
        report_train = classification_report(
            y_train, svm_model.predict(X_train_scaled), output_dict=True, zero_division=0.0
        )

        print("SVM Test Mean Class Accuracy: \n ")
        print(f" kernel: {k}\n")
        for heading, current in ((None, report), ("\n SVM Train Mean Class Accuracy: \n ", report_train)):
            if heading is not None:
                print(heading)
            for cls in classes:
                line = f'class: {cls} , '
                for m in metrics:
                    # A class absent from the report is shown as 0.
                    val = current.get(cls, {}).get(m, 0.0)
                    if m != 'support':
                        line += f'{m}: {val:.2f} , '
                    else:
                        line += f'{m}: {int(val)}'
                print(line)



# Load the cleaned dataset from the outlier-detection output folder.
final_data2 = pd.read_excel('outlier_detection_output/final_data_cleaned.xlsx')
#final_data2=pd.read_excel("merged_files.xlsx").drop(columns=["TECHNOLOGY_GAS",'CONTOUR_LASER_MODE','LASER_TYPE','QUALITY_CUT'])

print(f"final_data2 length: {len(final_data2)}")
# Rows that agree on every non-label column count as duplicates, even when
# their labels differ; only the first occurrence is kept.
label_columns = ['DEFECT_TYPE', 'QUALITY_CUT']
feature_columns = [col for col in final_data2.columns if col not in label_columns]
final_data2 = final_data2.drop_duplicates(subset=feature_columns, keep='first')
final_data2 = final_data2.reset_index(drop=True)
print(f"final_data2 length after cleaning: {len(final_data2)}")
print(final_data2.columns)

encoded_data = flatten_categorical_columns(
    final_data2,
    ["NOZZLE_TYPE", "MATERIAL_NAME_TULUS"],
    only_quality_cut=False,
)
print(encoded_data.columns)

# One reference split, used for the class-balance printout and by the
# experiments that take pre-split data.
X_train, X_test, y_train, y_test = split_data(encoded_data, only_quality_cut=False)
print(f'Train classes: \n {X_train["DEFECT_TYPE"].value_counts()}')
print(f'\n Test classes: \n {X_test["DEFECT_TYPE"].value_counts()}')
X_train = X_train.drop(columns=["DEFECT_TYPE"])
X_test = X_test.drop(columns=["DEFECT_TYPE"])

# Experiment switchboard: enable exactly the model(s) to evaluate.
#catboost_classifier(final_data2)
#mlp_classifier(encoded_data,X_train, X_test, y_train, y_test)
#apply_decision_tree(encoded_data,X_train, X_test, y_train, y_test)
#apply_random_forest(encoded_data,X_train, X_test, y_train, y_test)
#apply_SVM(X_train, X_test, y_train, y_test)
apply_KNN(encoded_data, X_train, X_test, y_train, y_test)
#linear_svm(X_train, X_test, y_train, y_test)




