import numpy as np
import pandas as pd
import copy
import time


class AdamOptimizer:
  """Adam-style optimizer.

  Keeps, per parameter, an exponentially weighted average of the gradients
  and of the squared gradients, applies bias correction based on the current
  timestep, and turns them into additive parameter updates.
  """

  def __init__(self, learning_rate, momentum = 0.9, sum_squared_weight=0.9):
    """Store the step size and the two decay rates.

    learning_rate: scale applied to every update.
    momentum: decay rate of the running average of gradients.
    sum_squared_weight: decay rate of the running average of squared gradients.
    """
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.sum_squared_weight = sum_squared_weight

  def initialize_parameters(self, parameters):
    """Reset the timestep to 1 and zero both running averages.

    parameters: mapping of parameter name -> array; only each array's
    ``shape`` is used, to allocate matching zero arrays.
    """
    self.timestep = 1
    shapes = {name: value.shape for name, value in parameters.items()}
    self.weighted_sum_of_derivatives = {name: np.zeros(shape) for name, shape in shapes.items()}
    self.weighted_sum_of_squared_derivatives = {name: np.zeros(shape) for name, shape in shapes.items()}

  def calculate_updates(self, gradients):
    """Return a dict of additive updates for one optimization step.

    gradients: mapping of parameter name -> gradient; every name must have
    been registered through ``initialize_parameters``.
    The running averages are updated in place and the timestep is advanced
    by one once all parameters have been processed.
    """
    beta_1 = self.momentum
    beta_2 = self.sum_squared_weight
    tolerance = 1.0e-8 # to avoid division by zero

    # The bias-correction denominators depend only on the timestep, so they
    # are the same for every parameter of this step.
    first_correction = 1 - np.power(beta_1, self.timestep)
    second_correction = 1 - np.power(beta_2, self.timestep)

    first_moments = self.weighted_sum_of_derivatives
    second_moments = self.weighted_sum_of_squared_derivatives

    updates = {}
    for name, grad in gradients.items():
      first_moments[name] = beta_1 * first_moments[name] + (1 - beta_1) * grad
      second_moments[name] = beta_2 * second_moments[name] + (1 - beta_2) * np.power(grad, 2)

      unbiased_first = first_moments[name] / first_correction
      unbiased_second = second_moments[name] / second_correction

      updates[name] = - self.learning_rate * unbiased_first / (np.power(unbiased_second, 0.5) + tolerance)

    self.timestep += 1
    return updates


class Multi_Layer_Perceptron:
  """Fully connected feed-forward network trained by mini-batch backpropagation.

  Weights and biases are stored in ``self.parameters`` under the keys
  ``W{l}`` (inputs x units) and ``b{l}`` (1 x units), layers numbered from 1.
  """

  def __init__(self, layer_sizes, activation_functions=None):
    """Describe the network.

    layer_sizes: number of units of every layer after the input layer.
    activation_functions: optional sequence with one activation name per
      layer (see ``activation_function`` for the accepted names). When it is
      omitted the names are read from the module-level variable ``type``,
      which keeps scripts that configure the network that way working.
    """
    self.layer_sizes = layer_sizes
    self.activation_functions = activation_functions

  def _activation_names(self):
    """Return the per-layer activation names (explicit list or legacy global)."""
    explicit = getattr(self, "activation_functions", None)
    if explicit is not None:
      return explicit
    # Legacy configuration: a module-level `type` bound to the list of names.
    # globals() does not contain the builtin `type`, so None means "not set".
    legacy = globals().get("type")
    if legacy is None:
      raise ValueError("No activation functions configured: pass activation_functions to the constructor")
    return legacy

  @staticmethod
  def _should_log_epoch(epoch):
    """Tell whether the 1-based `epoch` falls on the progressively sparser logging schedule."""
    if epoch < 25:
      return True
    if epoch < 50:
      return epoch % 5 == 0
    if epoch < 100:
      return epoch % 10 == 0
    if epoch < 500:
      return epoch % 25 == 0
    if epoch < 1000:
      return epoch % 50 == 0
    return epoch % 100 == 0

  def loss(self, y, y_hat):
    """Return, for each row, half the sum of squared errors.

    The 1/2 factor makes the derivative with respect to ``y_hat`` equal to
    the plain error ``y_hat - y``. Rows are reduced along axis 1, so both
    arguments must be 2-D.
    """
    error = y - y_hat
    return 0.5*np.sum(np.multiply(error, error), axis=1)

  def activation_function(self,type,z):
    """Apply the activation called `type` element-wise to `z`.

    Accepted names: 'sigmoid', 'tanh', 'relu', 'lin', 'softmax', 'step',
    'sine'. Raises ValueError for any other name.
    """
    if type == 'sigmoid':
      return 1 / (1 + np.exp(-z))
    elif type == 'tanh':
      return np.tanh(z)
    elif type == 'relu':
      return np.maximum(0, z)
    elif type == 'lin':
      return z
    elif type == 'softmax':
      # Subtracting the row maximum keeps the exponentials from overflowing.
      exp_x = np.exp(z - np.max(z, axis=-1, keepdims=True))
      return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
    elif type == 'step':
      return np.where(z >= 0, 1, 0)
    elif type == 'sine':
      return np.sin(z)
    else:
      raise ValueError("Invalid activation function type")

  def activation_function_derivative(self,type,z):
    """Return the element-wise derivative of the activation `type` at `z`.

    Raises ValueError for an unknown name.
    """
    if type == 'sigmoid':
      s = self.activation_function('sigmoid',z)
      return s * (1 - s)
    elif type == 'tanh':
      return 1 - np.tanh(z)**2
    elif type == 'relu':
      return np.where(z > 0, 1, 0)
    elif type == 'lin':
      return np.ones_like(z)
    elif type == 'softmax':
      # NOTE: s * (1 - s) is only the diagonal of the softmax Jacobian; the
      # cross terms between outputs are not represented here.
      s = self.activation_function('softmax',z)
      return s * (1 - s)
    elif type == 'step':
      return np.zeros_like(z)
    elif type == 'sine':
      return np.cos(z)
    else:
      raise ValueError("Invalid activation function type")

  def initialize_parameters(self, input_layer_size):
    """Create fresh parameters: small random normal weights and zero biases."""
    self.parameters = {}

    previous_layer_size = input_layer_size

    for layer_index, layer_size in enumerate(self.layer_sizes, start=1):
      self.parameters[f"W{layer_index}"] = np.random.randn(previous_layer_size, layer_size) * 0.1
      self.parameters[f"b{layer_index}"] = np.zeros((1, layer_size))
      previous_layer_size = layer_size

  def forward_propagation(self, X):
    """Run `X` through the network.

    Returns a dict holding the input as ``H0`` and, for every layer ``l``,
    the pre-activation ``Z{l}`` and the activation output ``H{l}``.
    """
    activations = self._activation_names()
    node_values = {"H0" : X}
    # matrix products from the input layer to the output layer
    for layer_index in range(1, len(self.layer_sizes)+1):
      z = np.dot(node_values[f"H{layer_index-1}"], self.parameters[f"W{layer_index}"]) + self.parameters[f"b{layer_index}"]
      node_values[f"Z{layer_index}"] = z
      node_values[f"H{layer_index}"] = self.activation_function(activations[layer_index-1], z)

    return node_values

  def backpropagation(self, node_values, Y):
    """Compute the batch-averaged gradients of the loss.

    node_values: dict produced by ``forward_propagation`` for the batch.
    Y: targets of the batch.
    Returns a dict with one gradient per entry of ``self.parameters``.
    """
    activations = self._activation_names()
    num_layers = len(self.layer_sizes)
    Y_hat = node_values[f"H{num_layers}"]
    d = {}

    # Output layer: derivative of the loss times the derivative of the output
    # activation. The activation is looked up with the same index used in the
    # forward pass so both passes always agree.
    d[f"dL/dZ{num_layers}"] = (Y_hat - Y) * self.activation_function_derivative(activations[num_layers-1], node_values[f"Z{num_layers}"])

    # Walk from the last layer back to the first one
    for l in range(num_layers, 0, -1):
      d[f"dL/dW{l}"] = np.dot(node_values[f"H{l-1}"].T, d[f"dL/dZ{l}"])
      d[f"dL/db{l}"] = np.sum(d[f"dL/dZ{l}"], axis=0, keepdims=True)

      if l > 1:
        d[f"dL/dH{l-1}"] = np.dot(d[f"dL/dZ{l}"], self.parameters[f"W{l}"].T)
        d[f"dL/dZ{l-1}"] = np.multiply(d[f"dL/dH{l-1}"], self.activation_function_derivative(activations[l-2], node_values[f"Z{l-1}"]))

    # effective length of the batch (the last one may be shorter)
    m = len(Y)

    # mean of the gradients over the current batch
    parameter_gradients = {}
    for l in range(1, num_layers + 1):
      parameter_gradients[f"W{l}"] = 1/m * d[f"dL/dW{l}"]
      parameter_gradients[f"b{l}"] = 1/m * d[f"dL/db{l}"]

    return parameter_gradients

  def apply_updates(self, updates):
    """Add every update in place to the parameter of the same name."""
    for param, value in updates.items():
      self.parameters[param] += value

  def fit(self, X_train, Y_train, X_validate, Y_validate, max_iterations, optimizer, batch_size):
    """Train the network and keep the parameters with the best validation loss.

    X_train, Y_train: training inputs and targets (2-D arrays).
    X_validate, Y_validate: validation inputs and targets.
    max_iterations: number of epochs.
    optimizer: object exposing ``initialize_parameters(parameters)`` and
      ``calculate_updates(gradients)``.
    batch_size: number of rows per mini-batch.

    Progress lines are appended to 'history_losses.txt' on the logging
    schedule and whenever a new best validation loss is reached.
    Returns the list of ``(train_loss, val_loss)`` pairs, one per epoch.
    If the validation loss never drops below the initial threshold, the
    parameters reached at the end of training are kept.
    """
    # Size the first layer from the data actually passed in.
    self.initialize_parameters(X_train.shape[1])
    optimizer.initialize_parameters(self.parameters)
    num_layers = len(self.layer_sizes)

    # min_val_loss initialized equal to 1 as the data got normalized
    min_val_loss = 1
    best_parameters = None
    losses = []

    for iteration in range(max_iterations):
      epoch = iteration + 1
      batch_losses = []

      for batch_start in range(0, len(Y_train), batch_size):
        batch_end = min(batch_start + batch_size, len(Y_train))

        X_batch = X_train[batch_start:batch_end]
        Y_batch = Y_train[batch_start:batch_end]
        node_values = self.forward_propagation(X_batch)
        gradients = self.backpropagation(node_values, Y_batch)

        updates = optimizer.calculate_updates(gradients)
        self.apply_updates(updates)

        # The batch loss uses the predictions made before this update.
        train_preds = node_values[f"H{num_layers}"]
        batch_losses.append(np.sum(self.loss(Y_batch, train_preds))/len(Y_batch))

      train_loss = np.mean(batch_losses)

      val_preds = self.predict(X_validate)
      val_loss = np.sum(self.loss(Y_validate, val_preds))/len(Y_validate)

      improved = val_loss < min_val_loss

      if improved or self._should_log_epoch(epoch):
        result_string = f"epoch: {epoch:4d}, train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}"
        suffix = " <-- NEW BEST SAVED!\n" if improved else "\n"
        # The context manager closes the file even if the write fails.
        with open('history_losses.txt', 'a') as file:
          file.write(result_string + suffix)

      losses.append((train_loss, val_loss))

      if improved:
        min_val_loss = val_loss
        best_parameters = copy.deepcopy(self.parameters)

    if best_parameters is not None:
      self.parameters = best_parameters
    return losses

  def predict(self, X):
    """Return the output-layer activations for `X`."""
    node_values = self.forward_propagation(X)

    num_layers = len(self.layer_sizes)
    return node_values[f"H{num_layers}"]

def preprocessing(mode1,dataset):
    """Scale every column of `dataset` by its largest absolute value.

    mode1: preprocessing mode; only 'normalize' is implemented.
    dataset: 2-D numeric data, reduced along axis 0 (one scale per column).

    Returns ``(scaled_dataset, scales)``. Columns whose largest absolute
    value is 0 use a scale of 1 so the division is always defined.
    Raises ValueError for any other mode.
    """
    # Guard clause: reject unsupported modes before touching the data.
    if mode1 != 'normalize':
        raise ValueError("Invalid preprocessing")

    # Max-abs scaling: one scale factor per column.
    column_scales = np.max(np.abs(dataset), axis=0)

    # All-zero columns would otherwise cause a division by zero.
    column_scales[column_scales == 0] = 1

    return dataset/column_scales, column_scales


#_____________________________________#
# ============ MAIN CODE ============ #
# Driver script. The names `activation_functions`, `MLP_shape`, `mode` and,
# depending on the mode, `dataset_file_name`, `percentage`, `seed`,
# `learning_rate`, `momentum`, `sum_squared_weight`, `max_epoch_N`,
# `batch_size`, `p`, `maxes_X`, `maxes_Y` and `parameters` are not defined in
# this file and must be provided by the caller before this code runs.

# NOTE: this rebinds the builtin name `type` for the whole module. It is kept
# because the network looks the per-layer activation names up under this
# module-level name when they are not passed to it explicitly.
type=activation_functions
start_time = time.time()
# The first entry of MLP_shape is the input size; the rest are the layers.
MLP_struct = [int(x) for x in MLP_shape[1:]]
model = Multi_Layer_Perceptron(MLP_struct)


# Training mode
if mode == "train":

    # Load the dataset and scale it.
    td = pd.read_csv(dataset_file_name)
    way = "normalize"
    td_preprocessed, maxes = preprocessing(way, td)
    # Shuffle the rows (and keep only the requested fraction of them).
    shuffled_td = td_preprocessed.sample(frac=percentage, random_state=seed)
    # Input and output sizes, cast to int like MLP_struct so they can be used
    # as positional slice bounds below.
    Input_Size = int(MLP_shape[0])
    Output_Size = int(MLP_shape[-1])
    print("Review of the given specification:")
    print(f"- Input  layer size: {Input_Size}")
    print(f"- Output layer size: {Output_Size}")
    print(f"- Number of columns dataset provided: {len(shuffled_td.columns)}")

    # Consistency check between the provided I/O sizes and the dataset columns.
    if len(shuffled_td.columns) == Input_Size + Output_Size:
        print("- Input and Output layer sizes are consistent.")
        # Every activation except the last one belongs to a hidden layer.
        hidden_names = "', '".join(str(name) for name in activation_functions[:-1])
        print(f"- Hidden Layers Activation Function: '{hidden_names}'")
        print(f"- Output Activation Function: '{activation_functions[-1]}'")
        print('- MLP overall shape: ', MLP_shape)
        print()
        print("---------- MLP: Training and Validation ----------")

        # Full input (X) and output (Y) matrices. They stay defined at module
        # level so any code that looks them up as globals keeps working.
        X = shuffled_td.iloc[:, :Input_Size].values
        Y = shuffled_td.iloc[:, -Output_Size:].values

        # Split the rows into training (80%) and validation (20%) data.
        # The 80/20 rule is commonly accepted in the literature; with much
        # larger datasets the validation share can be decreased.
        num_training = int(len(shuffled_td)*.8)
        training_td = pd.DataFrame(shuffled_td[:num_training])
        validation_td = pd.DataFrame(shuffled_td[num_training:])
        # training data - 80% of the whole set
        X_train_scaled = training_td.iloc[:, :Input_Size].values
        Y_train_scaled = training_td.iloc[:, -Output_Size:].values
        # validation data - 20% of the whole set
        X_validation_scaled = validation_td.iloc[:, :Input_Size].values
        Y_validation_scaled = validation_td.iloc[:, -Output_Size:].values

        # Optimizer set-up
        opt = AdamOptimizer(learning_rate,
                            momentum=momentum,
                            sum_squared_weight=sum_squared_weight)
        # Training
        losses = model.fit(X_train_scaled,
                           Y_train_scaled,
                           X_validation_scaled,
                           Y_validation_scaled,
                           max_epoch_N,
                           opt,
                           batch_size)

        print("- Training and validation successfully terminated -")

        # Final parameters, exposed as a dictionary
        final_MLP = model.parameters

        # Evaluating computational efficiency
        elapsed_time = time.time() - start_time
        if elapsed_time < 10:
            elapsed_time_str = f'{elapsed_time * 1000:.0f} msec'
        else:
            elapsed_time_str = f'{elapsed_time:.2f} sec'
        # Printed for every run, whichever unit was chosen above.
        print(f"End of training.\n"
              f"Elapsed time: {elapsed_time_str}\n"
              f"Number of epochs: {max_epoch_N}\n"
              f"Average elapsed time for each epoch: {elapsed_time/max_epoch_N* 1000 :.3f} msec")

    # Inconsistent sizes: nothing is trained.
    else:
        print("Error: Dataset or Input/Output layer are not consistent.")
        final_MLP = False


# Deployment mode
elif mode == "deploy":
    print("MLP deployment (Python)...")
    # Scale the inputs, predict, then scale the outputs back.
    x_scaled = p/maxes_X

    model.parameters=parameters
    y_scaled=model.predict(x_scaled)
    y=y_scaled*maxes_Y

    # Print the value of y with four decimals
    print("y_py =")
    print(np.array2string(y, formatter={'float_kind': lambda x: "%.4f" % x}))

    # Evaluating computational efficiency
    elapsed_time = time.time() - start_time
    if elapsed_time < 10:
        elapsed_time_str = f'{elapsed_time * 1000:.4f} msec'
    else:
        elapsed_time_str = f'{elapsed_time:.4f} sec'
    print(f'Elapsed time python file call: {elapsed_time_str}')
    print(f'Number of Input processed: {len(y)}')
    print(f'Average elapsed time for each Input: {elapsed_time * 1000 / len(y):.7f} msec')
    print('-------------------------------------------------')

else:
    # The message names the two modes this script actually handles.
    print("Error: mode must be either 'train' or 'deploy'")