import os
import random

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from kmodes.kprototypes import KPrototypes
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans,DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

from utils import flatten_categorical_columns, reverse_get_dummies

# Continuous process parameters used as numeric features by the rest of the script.
cont_features = [
    "THICKNESS_TULUS [mm]",
    "CONTOUR_SPEED [mm/min]",
    "LASER_POWER [W]",
    "CONTOUR_GAS_PRESSURE [bar]",
    'CONTOUR_NOZZLE_DISTANCE [mm]',
    "CONTOUR_FOCAL [mm]",
]

# Two Excel workbooks are read from the working directory; the second one is
# tagged 'real' below, the first one 'syn'.
final_data = pd.read_excel('final_greedy_all_defects.xlsx')
real_data = pd.read_excel("merged_files.xlsx")
real_data = real_data.drop(
    columns=['LASER_TYPE', 'TECHNOLOGY_GAS', 'CONTOUR_LASER_MODE', 'QUALITY_CUT']
)

# Restrict the 'syn' frame to the columns of the 'real' one. The explicit
# copy() detaches the selection from the frame it was taken from, so adding
# the 'origin' column does not raise pandas' SettingWithCopyWarning.
final_data = final_data[real_data.columns].copy()
final_data['origin'] = 'syn'
real_data['origin'] = 'real'
final_data = pd.concat([real_data.copy(), final_data.copy()], axis=0, ignore_index=True)

print(f"final_data2 length: {len(final_data)}")
# Rows are duplicates when they agree on every column except the listed ones.
# NOTE(review): 'origin' is part of that key, so a 'real' row and a 'syn' row
# with identical parameters are never treated as duplicates of each other -
# confirm this is intended.
dedup_key = [col for col in final_data.columns if col not in ['DEFECT_TYPE', 'QUALITY_CUT']]
final_data = final_data.drop_duplicates(subset=dedup_key, keep='first').reset_index(drop=True)
print(f"final_data2 length after cleaning: {len(final_data)}")
# Snapshot of the merged, de-duplicated frame before any further filtering.
final_data_copy = final_data.copy()


print(final_data.columns)
# Disabled exploratory plot, kept as an inert string literal: box plots of each
# continuous feature grouped by NOZZLE_TYPE.
"""
for col in cont_features:
    grouped = final_data.groupby('NOZZLE_TYPE')[cont_features].mean()
    sns.boxplot(x='NOZZLE_TYPE', y=col, data=final_data)
    plt.xticks(rotation=45)
    plt.title(f"Distribuzione di {col} per tipo di ugello (NOZZLE_TYPE)")
    plt.show()
"""

# Value passed as IsolationForest's `contamination` for each defect class.
defect_contaminations = {
    'Burr': 0.50,
    'No Defects': 0.40,
    'Plasma': 0.20,
    'Cutting torn': 0.20,
    'Cutting loss': 0.20,
}
cat_cols = ['NOZZLE_TYPE']
final_data = final_data[
    cont_features + ['NOZZLE_TYPE', 'MATERIAL_NAME_TULUS', 'DEFECT_TYPE', 'origin']
].copy()
# Thickness is one of the grouping keys below and therefore constant inside
# each group; the remaining features are the ones that get scaled and modelled.
new_cont_features = [col for col in cont_features if col != 'THICKNESS_TULUS [mm]']

OUTPUT_DIR = 'outlier_detection_output'
_HOVER_TEMPLATE = (
    'NOZZLE_TYPE: %{text}<br>PCA1: %{x:.2f}<br>PCA2: %{y:.2f}<br>PCA3: %{z:.2f}<extra></extra>'
)


def _safe_filename(name):
    """Return *name* with path separators and Windows-reserved characters replaced by '_'."""
    return ''.join('_' if ch in '<>:"/\\|?*' else ch for ch in str(name))


def _drop_synthetic_outliers(group, features, contaminations, defect):
    """Run one IsolationForest per nozzle type and drop flagged synthetic rows.

    Rows whose ``origin`` is ``'real'`` are kept even when flagged. The result
    always carries the columns of *group*, also when no row survives, so
    callers can keep treating it as a regular frame.

    Args:
        group: Rows of one (defect, material, thickness) combination.
        features: Columns that are standardised and fed to the forest.
        contaminations: Mapping from defect name to contamination value.
        defect: Defect name of *group*; looked up only when a forest is fitted.

    Returns:
        A new DataFrame with a fresh RangeIndex.

    Raises:
        KeyError: If *defect* has no entry in *contaminations* and at least one
            nozzle subset is large enough to be modelled.
    """
    kept_parts = []
    for nozzle in group['NOZZLE_TYPE'].unique():
        subset = group[group['NOZZLE_TYPE'] == nozzle]
        if len(subset) < 2:
            # Too few rows to fit a forest.
            # NOTE(review): these rows are discarded even when they are
            # 'real' - confirm that this is intended.
            continue

        scaled = StandardScaler().fit_transform(subset[features])
        iso_forest = IsolationForest(
            contamination=contaminations[defect],  # per-defect value, see mapping
            random_state=42,
            n_estimators=100,
        )
        labels = iso_forest.fit_predict(scaled)
        keep = (labels != -1) | (subset['origin'] == 'real').to_numpy()
        kept = subset[keep]
        if not kept.empty:
            kept_parts.append(kept)

    if not kept_parts:
        # Empty, but with the original columns (an empty pd.DataFrame() would
        # make the later drop(columns=['origin']) raise KeyError).
        return group.iloc[0:0].copy()
    return pd.concat(kept_parts, ignore_index=True)


def _project_to_3d(reference_features, cleaned_features):
    """Fit PCA on the reference matrix and project both matrices onto 3 axes.

    PCA cannot extract more components than min(n_samples, n_features), so the
    component count is capped and missing axes are filled with zeros.
    """
    n_components = min(3, reference_features.shape[0], reference_features.shape[1])
    pca = PCA(n_components=n_components, random_state=42)
    coords_reference = pca.fit_transform(reference_features)
    coords_cleaned = pca.transform(cleaned_features)

    missing = 3 - n_components
    if missing:
        pad_width = ((0, 0), (0, missing))
        coords_reference = np.pad(coords_reference, pad_width, mode='constant')
        coords_cleaned = np.pad(coords_cleaned, pad_width, mode='constant')
    return coords_reference, coords_cleaned


def _pca_trace(coords, nozzles, color_map, name, opacity, symbol):
    """Build one 3D scatter trace, coloured by nozzle type."""
    return go.Scatter3d(
        x=coords[:, 0], y=coords[:, 1], z=coords[:, 2],
        mode='markers',
        marker=dict(
            size=6,
            color=[color_map[nozzle] for nozzle in nozzles],
            opacity=opacity,
            symbol=symbol,
            line=dict(width=1, color='black'),
        ),
        name=name,
        text=nozzles,
        hovertemplate=_HOVER_TEMPLATE,
    )


# Create the output folder up front so write_html / to_excel cannot fail on a
# missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
color_palette = px.colors.qualitative.Set1
cleaned_groups = []

for defect in final_data['DEFECT_TYPE'].unique():
    defect_rows = final_data[final_data['DEFECT_TYPE'] == defect]

    for material in defect_rows['MATERIAL_NAME_TULUS'].unique():
        material_rows = defect_rows[defect_rows['MATERIAL_NAME_TULUS'] == material]
        thicknesses = sorted(material_rows['THICKNESS_TULUS [mm]'].dropna().unique().tolist())

        for thickness in thicknesses:
            group = material_rows[material_rows['THICKNESS_TULUS [mm]'] == thickness]
            # Uncleaned rows of the group, re-indexed from 0 so they line up
            # positionally with the PCA coordinates computed from them.
            reference = group.reset_index(drop=True)

            if len(group) >= 5:
                cleaned = _drop_synthetic_outliers(
                    group, new_cont_features, defect_contaminations, defect
                )
            else:
                # Group too small for outlier detection: keep only non-synthetic rows.
                cleaned = group[group['origin'] != 'syn']
            cleaned = cleaned.drop(columns=['origin']).reset_index(drop=True)

            if len(cleaned) < 2:
                # Nothing to plot or export for this group; also avoids
                # transforming an empty frame, which scikit-learn rejects.
                # NOTE(review): groups with a single surviving row are left out
                # of the exported dataset - confirm that this is intended.
                continue

            # Standardise with statistics of the uncleaned group so that both
            # point sets share one coordinate system.
            scaler2 = StandardScaler()
            reference_scaled = scaler2.fit_transform(reference[new_cont_features])
            cleaned_scaled = scaler2.transform(cleaned[new_cont_features])

            print(f'Processing defect: {defect}, material: {material}, thickness: {thickness}')
            print(f'old_final_data2: shape: {reference.shape}, nozzles:{reference["NOZZLE_TYPE"].unique()}, final_data2_scaled: shape: {cleaned.shape}, nozzles:{cleaned["NOZZLE_TYPE"].unique()}')

            coords_reference, coords_cleaned = _project_to_3d(reference_scaled, cleaned_scaled)

            # One shared nozzle -> colour mapping, so a nozzle has the same
            # colour in both traces.
            reference_nozzles = reference['NOZZLE_TYPE'].astype(str).tolist()
            cleaned_nozzles = cleaned['NOZZLE_TYPE'].astype(str).tolist()
            color_map = {
                nozzle: color_palette[i % len(color_palette)]
                for i, nozzle in enumerate(dict.fromkeys(reference_nozzles + cleaned_nozzles))
            }

            fig = go.Figure()
            fig.add_trace(_pca_trace(
                coords_reference, reference_nozzles, color_map,
                name='old data with outliers', opacity=0.5, symbol='circle',
            ))
            fig.add_trace(_pca_trace(
                coords_cleaned, cleaned_nozzles, color_map,
                name='new data without outliers', opacity=0.8, symbol='diamond',
            ))
            fig.update_layout(
                scene=dict(
                    xaxis_title='PCA 1',
                    yaxis_title='PCA 2',
                    zaxis_title='PCA 3'
                ),
                title=f'{defect} PCA 3D Plot for {material} at {thickness} mm',
                legend_title_text='Group'
            )

            # Interactive HTML export; the file name is sanitised because
            # defect/material names come straight from the data.
            plot_name = _safe_filename(f'pca_3d_{defect}_{material}_{thickness}.html')
            fig.write_html(os.path.join(OUTPUT_DIR, plot_name))

            cleaned_groups.append(cleaned)

# Single concat at the end instead of growing a frame inside the loop.
if cleaned_groups:
    final_data_cleaned = pd.concat(cleaned_groups, axis=0, ignore_index=True)
else:
    final_data_cleaned = pd.DataFrame()

# Export
final_data_cleaned.to_excel(os.path.join(OUTPUT_DIR, 'final_data_cleaned.xlsx'), index=False)