From e902ae06d5bd7dbf8d59687fc40fd06ea449827c Mon Sep 17 00:00:00 2001
From: Javier Valdes <javier.valdes@th-deg.de>
Date: Mon, 17 Mar 2025 21:43:13 +0100
Subject: [PATCH] model_1 added

---
 model_1.py | 787 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 787 insertions(+)
 create mode 100644 model_1.py

diff --git a/model_1.py b/model_1.py
new file mode 100644
index 0000000..7076feb
--- /dev/null
+++ b/model_1.py
@@ -0,0 +1,787 @@
+##### Imports #####
+
+
+import math
+import streamlit as st
+import pandas as pd
+import textwrap
+import statsmodels.formula.api as smf
+import statsmodels.tsa.api as smt
+import statsmodels.api as sm
+import scipy.stats as scs
+import numba
+import numpy as np
+import seaborn as sns
+
+import matplotlib.cm as cm
+from mpl_toolkits.mplot3d import Axes3D
+from matplotlib import colors
+import matplotlib.pyplot as plt
+import altair as alt
+from matplotlib.colors import ListedColormap
+
+##### Plotly ######
+import plotly.express as px
+import plotly.graph_objects as go
+import plotly.figure_factory as ff
+import chart_studio
+from plotly import tools
+from plotly.subplots import make_subplots
+import plotly.colors as pc
+
+import time #from datetime import datetime
+import datetime
+
+
+from scipy.fftpack import rfft
+from scipy.stats import boxcox
+from sklearn.cluster import AgglomerativeClustering, KMeans
+from sklearn.metrics import davies_bouldin_score
+from sklearn_extra.cluster import KMedoids
+from sklearn.metrics import calinski_harabasz_score
+from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram, linkage
+from sklearn.metrics.cluster import contingency_matrix
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.impute import SimpleImputer
+
+from datetime import timedelta #from datetime import datetime
+
+
+# Algorithms
+from tslearn.barycenters import dtw_barycenter_averaging
+from tslearn.clustering import TimeSeriesKMeans
+#from sktime.distances import dtw_distance
+from dtaidistance import clustering, dtw
+#from fcmeans import FCM
+
+
+# Implementation of pyclustering k-means
+from pyclustering.cluster.kmeans import kmeans
+from pyclustering.cluster.center_initializer import random_center_initializer
+from pyclustering.cluster.encoder import type_encoding
+from pyclustering.cluster.encoder import cluster_encoder
+from pyclustering.utils.metric import distance_metric
+from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
+from pyclustering.cluster.fcm import fcm
+from sklearn.metrics import pairwise_distances
+from validclust import cop, dunn
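+# cop and dunn are cluster validity indices provided by the validclust package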
+
+
+# Preprocessing
+from tslearn.preprocessing import TimeSeriesScalerMeanVariance
+from netdata_pandas.data import get_data, get_chart_list
+from am4894plots.plots import plot_lines, plot_lines_grid
+from matplotlib.patches import Ellipse
+from sklearn import preprocessing
+
+#from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score, silhouette_samples
+from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
+from sklearn.model_selection import train_test_split
+#from statsmodels.tsa.arima_model import ARIMA
+
+import warnings  # 'do not disturb' mode
+
+
+warnings.filterwarnings('ignore')
+
+####### DEF for null values #####
+
+def null_values(df):
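+    # Summarize missing data: per-column null count, null percentage, and dtype,
+    # keeping only columns that contain at least one null.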
+    null_test = (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False).index
+    null_data_test = pd.concat([
+        df.isnull().sum(axis=0),
+        (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False),
+        df.loc[:, df.columns.isin(list(null_test))].dtypes], axis=1)
+    null_data_test = null_data_test.rename(columns={0: '# null',
+                                                    1: '% null',
+                                                    2: 'type'}).sort_values(ascending=False, by='% null')
+    null_data_test = null_data_test[null_data_test["# null"] != 0]
+
+    return null_data_test
+
+
+
+def column_types(df):  # renamed from `type` to avoid shadowing the built-in
+    return pd.DataFrame(df.dtypes, columns=['Type'])
+
+def preprocessing_meanvar(df):
+    # Standardize each series to zero mean and unit variance, keeping the original shape
+    X = TimeSeriesScalerMeanVariance().fit_transform(df)
+    df = pd.DataFrame(X.reshape(df.shape), columns=df.columns, index=df.index)
+    return df
+
+
+def purity_score(y_true, y_pred):
+    # compute contingency matrix (also called confusion matrix)
+    confusion_matrix = contingency_matrix(y_true, y_pred)
+    # return purity
+    return np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
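+# Example (hypothetical labels): purity_score([0, 0, 1, 1], [1, 1, 0, 0]) -> 1.0,
+# purity_score([0, 0, 1, 1], [0, 1, 1, 1]) -> 0.75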
+
+
+
+
+# Read a file and do EDA
+def read_csv_and_auto_update_time_interval(uploaded_file):
+    if uploaded_file is not None:
+        data = pd.read_csv(uploaded_file)
+        df = data.copy()
+
+        # Convert the 'Time' column to datetime
+        df['Time'] = pd.to_datetime(df['Time'])
+
+        # Infer the time interval from the 'Time' column
+        time_interval = (df['Time'].diff().mean()).seconds
+        time_interval_str = f"{time_interval // 60}T"
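+        # 'T' is the pandas offset alias for minutes (e.g. '15T' = 15-minute intervals)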
+
+        # Calculate the start date and end date based on the inferred time interval
+        start_date = df['Time'].min()
+        end_date = df['Time'].max()
+
+        # Use the inferred time interval to generate the date range
+        date_range = pd.date_range(start=start_date, end=end_date, freq=time_interval_str)
+
+        # Update the 'Date' column with the new date range
+        df['Date'] = date_range
+
+        df['Hour'] = df['Date'].dt.hour
+        df['Days'] = df['Date'].dt.day
+        df['Weekday'] = df['Date'].dt.weekday
+        df['Month'] = pd.DatetimeIndex(df['Date']).month
+
+
+        return df, time_interval_str
+    return None, None
+
+
+
+def time_series_analysis(uploaded_file, display_full):
+    # st.subheader('2. Time Series Analysis')
+    df, time_interval = read_csv_and_auto_update_time_interval(uploaded_file)
+    all_cols = df.columns.tolist()
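+    # Numeric columns become analysis candidates; date-like columns are identified by name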
+    nums = df.select_dtypes(include=np.number).columns.tolist()[:4]
+    dates = list(filter(lambda x: 'date' in x.lower(), df.columns))
+    no_dates = list(filter(lambda x: 'date' not in x.lower(), df.columns))
+    df1 = pd.DataFrame()  # Initialize df1
+    if display_full:
+
+        help_text6 = "**Please choose one of: Ess active power, Grid active power, Consumption active power, Production active power**"
+        valu_col = st.selectbox(
+            'Which data column do you want to analyse?', nums, 1, help=help_text6)
+        
+        dat_col = dates[0]
+
+        piv_col = 'Month'  # Replace this with the actual column name for pivot
+
+        dat_plt = st.selectbox(
+            'Pick the time granularity for the plots',
+            ('YEAR', 'YEAR-MONTH', 'Days', 'Hour', 'Month', 'Weekday'),
+            index=3
+        )
+
+        filt_col = 'Weekday'  # Replace this with the actual column name for filtering
+
+        piks = df[filt_col].unique()
+
+        filtrs = st.multiselect('Select values to be included', piks, default=piks)
+
+        df_filtered = df[df[filt_col].isin(filtrs)].copy()
+
+        df_filtered['YEAR'] = pd.to_datetime(df_filtered[dat_col]).dt.year
+        df_filtered['YEAR-MONTH'] = pd.to_datetime(df_filtered[dat_col]).dt.to_period('M')
+
+        df1 = pd.pivot_table(df_filtered, values=valu_col, index=dat_plt, columns=piv_col)
+        # Normalize each pivot column to the 0-1 range
+        df1 = (df1 - df1.min()) / (df1.max() - df1.min())
+
+        # Create a list of month names
+        month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
+        # Check the length of the DataFrame columns and the month names list
+        if len(df1.columns) == len(month_names):
+            # Set the month names as the columns
+            df1.columns = month_names
+        else:
+            st.error("Length mismatch: The number of columns in the DataFrame does not match the length of the month names list.") 
+        #df1.columns = month_names
+        #df1 = df1.reset_index()
+
+        #df1.set_index(dat_plt, inplace=True)
+        #df1.index = df1.index.astype(str)
+
+        se = df1.sum()
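+        # Column totals per month (currently unused)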
+
+        start_date, end_date = st.select_slider(
+            'Select the Hours range',
+            options=df1.index,
+            value=(df1.index[0], df1.index[-1])
+        )
+
+        st.write('You have selected between', start_date, ' and ', end_date, 'hours')
+        
+        # if st.button('Series PLot from selection'):
+        fig = px.line(df1.loc[start_date:end_date])
+        # Customize the layout if needed
+        fig.update_layout(
+            title="Total Energy Consumption",
+            xaxis_title="Hour",
+            yaxis_title="Consumption Active Power in kW"
+        )
+        st.plotly_chart(fig)
+        
+
+
+    else:
+        # Select columns with numeric data
+        nums = df.select_dtypes(include=np.number).columns.tolist()
+        
+        # Initialize df1
+        df1 = pd.DataFrame()
+        
+        # Set the value column and filters
+        valu_col = 'Consumption active power'  # Use the 'Consumption active power' column for the plot
+        dat_plt = 'Hour'  # X-axis will represent 'Hour'
+        filt_col = 'Month'  # Filter data by 'Month'
+        
+        # Create a pivot table where the 'Hour' is the index, and columns are the months
+        df1 = pd.pivot_table(df, values=valu_col, index=dat_plt, columns=filt_col)
+        
+        # Normalize each month's column to the 0-1 range
+        df1 = (df1 - df1.min()) / (df1.max() - df1.min())
+        
+        # Create a plotly figure
+        fig = go.Figure()
+
+        # Add a trace for each month
+        for month in df1.columns:
+            fig.add_trace(go.Scatter(
+                x=df1.index,  # Hour of the day
+                y=df1[month],  # Normalized consumption power
+                mode='lines',  # Line plot
+                name=str(month)  # Month number as the trace name
+            ))
+
+        # Update layout for better visualization
+        fig.update_layout(
+            title="Normalized Consumption Power by Hour of the Day",
+            xaxis_title="Hour of the Day",
+            yaxis_title="Normalized Consumption Power (kW)",
+            legend_title="Month",
+            template="plotly_dark",  # Dark theme, you can change it to "plotly" for a light theme
+        )
+
+        # Show the plot
+        st.plotly_chart(fig)
+
+    return df, df1
+
+
+
+
+def choose_parameters():
+    help_text1 = "**Selecting the number of clusters** involves finding the ideal count of distinct groups within a dataset during cluster analysis."
+    n_clusters = st.sidebar.number_input("Choose the number of Clusters", 2, 50, step=1, key='no_of_clusters', help=help_text1)
+    
+    help_text2 = "**By initializing a random seed**, it is possible to reproduce the same sequence of random numbers, which is crucial for ensuring the reproducibility of experiments."
+    number = st.sidebar.number_input('Random Seed', min_value=10, step=15, help=help_text2)
+    
+    mod_l = ['Time Series K-Means', 'hierarchical', 'DBA', 'KMedoids']
+    help_text3 = "**Time Series**: Time series refers to a sequence of data points recorded at specific time intervals. This data format is used to analyze trends, patterns, and behaviors over time.\n**K-Means**: K-Means is a popular clustering algorithm that partitions data into 'k' clusters, aiming to minimize the sum of squared distances between the data points and the cluster centroids.\n**Hierarchical**: Hierarchical clustering is an algorithm that builds a hierarchy of clusters by either merging or splitting them successively based on the similarity between data points.\n**DBA (Dynamic Time Warping Barycenter Averaging)**: DBA is a technique used to align time series data, finding a representative average pattern or prototype from a set of time series sequences.\n**KMedoids**: KMedoids is a clustering algorithm similar to K-Means, but instead of using the mean value of the points in a cluster as the center, it uses the most centrally located point as the medoid."
+    mod_choice = st.sidebar.selectbox("Select model", mod_l, help=help_text3)
+    
+    help_text4 = "**This parameter helps in regulating the convergence of the algorithm**, enabling the user to balance computational efficiency with the quality of results."
+    iter = st.sidebar.number_input('Max Iteration', min_value=10, step=5, help=help_text4)
+    
+    help_text5 = "**It is a parameter used in k-means clustering** to improve the quality of the final clustering by running the algorithm multiple times and selecting the solution with the lowest inertia (or within-cluster sum of squares)."
+    n_init = st.sidebar.number_input('n init', min_value=2, step=2, help=help_text5)
+    
+    return n_clusters, number, mod_choice, iter, n_init
+
+
+def clustering(df1, n_clusters, number, mod_choice, iter, n_init):
+    Xx = df1.copy()
+
+    # Check for and handle missing values
+    if Xx.isna().sum().sum() > 0:
+        imputer = SimpleImputer(strategy='mean')
+        Xx = imputer.fit_transform(Xx)
+
+    # Scale the data
+    scaler = MinMaxScaler()
+    Xx = scaler.fit_transform(Xx)
+
+    kmeans = KMeans(n_clusters=n_clusters, random_state=number)
+
+    clustered_data = df1.copy()  # Create a new DataFrame
+    clustered_data['cluster_pred'] = kmeans.fit_predict(Xx)  # Add the 'cluster_pred' column
+    
+    x_array = np.array(df1)
+    
+    scaler = MinMaxScaler()
+    x_scaled = scaler.fit_transform(x_array)
+
+    #if st.sidebar.checkbox("Dynamic Clustering"):
+    #st.subheader("Dynamic Clusterng ")
+    ## Dynamic Clustering
+    norm = True  # normalize the data to the 0-1 range
+    use_meanvar = False  # Set to True to use TimeSeriesScalerMeanVariance instead
+
+    # Fast Fourier Transform (FFT) and clustering of time series
+    # Set to True to cluster on the FFT of X; kept for future development
+    preprocessing_fft = False
+    preprocessing_sqrt = True
+    preprocessing_log = False
+    preprocessing_diff = False  # Set to True to cluster on first differences
+    model = mod_choice  # one of ['Time Series K-Means', 'hierarchical', 'DBA', 'KMedoids']
+    min_n = 0  # only interested in clusters with min_n or more members
+
+    # Work on a copy of the pivoted data
+    df2 = df1.copy()
+
+    df2 = df2.loc[:, ~df2.columns.duplicated()]
+
+    # To drop any empty columns
+    df2 = df2.dropna(axis=1, how='all')
+
+    # Fill any remaining N/A values with forward fill, then backward fill
+    df2 = df2.ffill().bfill()
+    
+    df2 = df2.fillna(df2.mean())
+
+    # do sqrt
+    if preprocessing_sqrt:
+        df2 = df2.apply(lambda col: np.sqrt(col))
+
+    # do log if set True
+    if preprocessing_log:
+        df2 = df2.apply(lambda col: np.log1p(col))
+
+    # take first differences if specified
+    if preprocessing_diff:
+        df2 = df2.diff()
+
+    start_time = time.perf_counter()
+
+    # normalizing data once
+    if norm:
+        df2 = (df2 - df2.min()) / (df2.max() - df2.min())
+
+    # drop any empty columns that may remain
+    df2 = df2.dropna(axis=1, how='all')
+
+    # Handle missing values with SimpleImputer (fit_transform returns a NumPy array)
+    imputer = SimpleImputer(strategy='mean')
+    df2 = imputer.fit_transform(df2)
+
+    # Get values to cluster on
+    if preprocessing_fft:
+        X = rfft(df2).transpose()
+    else:
+        X = df2.transpose()
+
+    #iter = st.sidebar.number_input('Max Iteration', min_value=10, step=5)
+
+    #n_init = st.sidebar.number_input('n init', min_value=2, step=2)
+
+
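+    # Each row of X is one metric's (transposed) series; the chosen model clusters these rows,
+    # and `mod_choice` is reassigned to the fitted model object below.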
+    if (mod_choice == 'Time Series K-Means'):
+        #dis = ["euclidean", "dtw", "softdtw"]
+        #dis_choice = st.sidebar.selectbox("Select Distance measure", dis)
+        mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, max_iter=iter, n_init=n_init, random_state=number).fit(X)
+
+
+    elif (mod_choice == 'DTW'):
+        dis = ["euclidean", "dtw", "softdtw"]
+        dis_choice = st.sidebar.selectbox("Select Distance measure", dis)
+        mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, metric=dis_choice, max_iter=iter, n_init=n_init, random_state=number).fit(X)
+
+
+    elif (mod_choice == 'softdtw'):
+        dis = ["euclidean", "dtw", "softdtw"]
+        dis_choice = st.sidebar.selectbox("Select Distance measure", dis)
+        mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, metric=dis_choice, max_iter=iter, n_init=n_init, random_state=number).fit(X)
+
+    elif (mod_choice == 'hierarchical'):
+        dis = ["euclidean", "manhattan", "cosine"]
+        dis_choice = st.sidebar.selectbox("Select Affinity measure", dis)
+        link = ['single', 'complete', 'average', 'ward']
+        link_choice = st.sidebar.selectbox("Select Linkage", link)
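+        # Note: newer scikit-learn releases rename AgglomerativeClustering's `affinity` argument to `metric`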
+        mod_choice = AgglomerativeClustering(distance_threshold=None, n_clusters=n_clusters, affinity=dis_choice,
+                                                linkage=link_choice).fit(X)
+        if st.checkbox("Plot Dendrogram"):
+            # Build the dendrogram from X with the selected linkage method
+            fig = ff.create_dendrogram(X, linkagefun=lambda d: linkage(d, link_choice))
+            st.plotly_chart(fig)
+
+    elif (mod_choice == 'DBA'):
+        dis = ["euclidean", "dtw", "softdtw"]
+        dis_choice = st.sidebar.selectbox("Select Distance measure", dis)
+        mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, n_init=n_init, metric=dis_choice, verbose=True,
+                                      max_iter_barycenter=iter, random_state=number).fit(X)
+
+    elif (mod_choice == 'KMedoids'):
+        dis = ["euclidean", "manhattan", "chebyshev", "canberra", "minkowski", "cosine"]
+        dis_choice = st.sidebar.selectbox("Select Distance measure", dis)
+        mod_choice = KMedoids(n_clusters=n_clusters, metric=dis_choice, method='pam', random_state=number).fit(X)
+
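+    # Overall silhouette score: ranges from -1 to 1, higher means better-separated clusters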
+    silhouette_avrg = silhouette_score(X, mod_choice.labels_)
+    #print("Latest Silhouette ",round(silhouette_avrg,2), " average is.")
+
+    df2 = pd.DataFrame(df2)
+    # Generate a dataframe pairing each metric (column) with its cluster label
+    cl_df = pd.DataFrame(list(zip(df2.columns, mod_choice.labels_)), columns=['metric', 'cluster'])
+
+    # Generate helper dictionaries and lists
+    cl_met = cl_df.groupby(['cluster'])['metric'].apply(list).to_dict()
+    cl_count = cl_df['cluster'].value_counts().to_dict()
+    a_clust = [cluster for cluster in cl_count]
+    drp_clust = [cluster for cluster in cl_count if cl_count[cluster] < min_n]
+    list_clust = [cluster for cluster in cl_count if cl_count[cluster] >= min_n]
+    clst_final = np.array(list_clust)
+
+    data2 = np.array([mod_choice.labels_])
+
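+    # Per-cluster quality: mean absolute pairwise correlation between the cluster's member metrics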
+    d_cl_qua = {}
+    for clst_i in a_clust:
+        cr_x = df2[cl_met[clst_i]].corr().abs().values
+        cr_x_mean = round(cr_x[np.triu_indices(cr_x.shape[0], 1)].mean(), 2)
+        d_cl_qua[clst_i] = cr_x_mean
+
+    # get quality score for each cluster
+    #if st.checkbox("View CVI's"):
+
+    #clst_eqal_choice == 'sil'
+    silhouette_scores = silhouette_samples(X, mod_choice.labels_)
+    cl_df_sil = pd.DataFrame(
+            list(zip(mod_choice.labels_, silhouette_scores)),
+            columns=['cluster', 'silhouette_score'])
+    cl_df_sil = cl_df_sil.groupby(['cluster']).max()
+    # print("Cluster Sil ",cl_df_sil, " score.")
+    d_cl_qua = cl_df_sil.to_dict()['silhouette_score']
+    # print("1. SIlhouette index ", silhouette_avrg, " is.")
+    #print("2. Latest Silhouette ", round(silhouette_avrg, 2), " average is.")
+
+    # build cluster level df with some cluster metadata
+    cl_df_met = pd.DataFrame.from_dict(cl_count, orient='index', columns=['n'])
+    cl_df_met.index.names = ['cluster']
+    #d_cl_qua = silhouette_avrg.to_dict()['silhouette_score']
+    cl_df_met['quality_score'] = cl_df_met.index.map(d_cl_qua)
+    cl_df_met = cl_df_met.sort_values('quality_score', ascending=False)
+
+    return df2, X, list_clust, mod_choice, cl_df
+
+
+###   Function to visualize all the clusters
+def visualization_clusters(df2, mod_choice, list_clust):
+    # Create a figure to plot all clusters together
+    all_clusters_fig = go.Figure()
+
+    # Group the columns of df2 by their cluster label
+    clustered_data = df2.groupby(mod_choice.labels_, axis=1)
+
+    # Filter out clusters that are not in list_clust
+    clustered_data = {k: v for k, v in clustered_data if k in list_clust}
+
+    # Function to generate random data points between two lines
+    def generate_random_between_lines(upper_line, lower_line, num_points=24, scale_factor=1.0):
+        random_data = np.random.uniform(lower_line, upper_line, num_points) * scale_factor
+        return random_data
+
+    # Create lists to hold the mean, max, and min values for each cluster
+    mean_data = []
+    upper_quartile = []
+    lower_quartile = []
+
+    for cluster_id in list_clust:
+        mean_data.append(clustered_data[cluster_id].mean(axis=1))
+        upper_quartile.append(clustered_data[cluster_id].quantile(0.75, axis=1))
+        lower_quartile.append(clustered_data[cluster_id].quantile(0.25, axis=1))
+
+    # Sort the clusters in ascending order
+    sorted_clusters = sorted(zip(list_clust, mean_data, upper_quartile, lower_quartile), key=lambda x: x[0])
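+    # Aggregate across clusters: the overall mean line plus an envelope built from the per-cluster 25th/75th percentiles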
+    mean_aggregated = np.mean([clustered_data[cluster_id].mean(axis=1) for cluster_id in list_clust], axis=0).tolist()
+    max_aggregated = np.max([clustered_data[cluster_id].quantile(0.75, axis=1) for cluster_id in list_clust], axis=0).tolist()
+    min_aggregated = np.min([clustered_data[cluster_id].quantile(0.25, axis=1) for cluster_id in list_clust], axis=0).tolist()
+
+    all_clusters_fig = go.Figure()
+    for cluster_id in list_clust:
+        all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=clustered_data[cluster_id].mean(axis=1).tolist(), mode='lines', name=f'Cluster {cluster_id}'))
+
+    all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=mean_aggregated, mode='lines', name='Mean', line=dict(color='black')))
+    all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=max_aggregated, mode='lines', name='Max', line=dict(color='lightgray')))
+    all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=min_aggregated, mode='lines', name='Min', fill='tonexty', line=dict(color='lightgray')))
+    all_clusters_fig.update_layout(xaxis_title="Metrics", yaxis_title="Values", title="All Clusters along with mean, min, max lines", showlegend=True)
+
+    st.plotly_chart(all_clusters_fig)
+
+    # Function to handle button click
+    def on_button_click():
+        for i, (cluster_id, cluster_mean, cluster_upper, cluster_lower) in enumerate(sorted_clusters):
+            fig = go.Figure()
+
+            # Add original cluster line
+            fig.add_trace(go.Scatter(x=df2.columns, y=cluster_mean, mode='lines', name=f'Cluster {cluster_id}'))
+            # Original mean line
+            #mean = np.mean([mean_data[cluster_id] for cluster_id in list_clust], axis=0)
+            #fig.add_trace(go.Scatter(x=df2.columns, y=mean, mode='lines', name='Mean', line=dict(color='black')))
+
+            # Generate random data points between upper and lower bounds
+            new_max_data = generate_random_between_lines(cluster_upper, cluster_mean)
+            new_min_data = generate_random_between_lines(cluster_mean, cluster_lower)
+
+            # Update the max and min curves
+            updated_max_line = np.mean([cluster_upper, new_max_data], axis=0)
+            updated_min_line = np.mean([cluster_lower, new_min_data], axis=0)
+
+            # Update the figure with the new min and max curves
+            #fig = update_min_max_curves(fig, df2.columns, updated_max_line, updated_min_line, cluster_id)
+            #fig.add_trace(go.Scatter(x=df2.columns, y=updated_max_line, mode='lines', name='New_Max', line=dict(width=3)))
+            #fig.add_trace(go.Scatter(x=df2.columns, y=updated_min_line, mode='lines', name='New_Min', line=dict(width=3)))
+
+            # Add aggregated max line
+            new_aggregated_max_line = np.mean([np.max(upper_quartile, axis=0), updated_max_line], axis=0)
+            fig.add_trace(go.Scatter(x=df2.columns, y=new_aggregated_max_line, mode='lines', name='Max', line=dict(width=3, dash='dash')))
+
+            # Add aggregated min line
+            new_aggregated_min_line = np.mean([np.min(lower_quartile, axis=0), updated_min_line], axis=0)
+            fig.add_trace(go.Scatter(x=df2.columns, y=new_aggregated_min_line, mode='lines', name='Min', fill='tonexty', line=dict(width=3, dash='dash')))
+
+            # Set layout for each subplot
+            plot_title = f"Cluster {cluster_id} with Random Values"
+            fig.update_layout(
+                #xaxis_title="Metrics",
+                yaxis_title="Total electricity consumption kW",
+                title=plot_title,
+            )
+
+            # Show the plot for each cluster separately
+            st.plotly_chart(fig, use_container_width=True)
+
+    # Add a button to generate random numbers
+    if st.button('Generate Energy Scenario with random values'):
+        on_button_click()
+
+
+
+def plot_energy_consumption(data, combined=True):
+    # Create a DataFrame with the necessary columns
+    df1 = data[['Hour', 'Month', 'Consumption active power']].copy()
+
+    # Normalize the 'Consumption active power' between 0 and 1
+    df1['Consumption active power'] = (df1['Consumption active power'] - df1['Consumption active power'].min()) / (df1['Consumption active power'].max() - df1['Consumption active power'].min())
+
+    # Create a pivot table where 'Hour' is the index, and 'Month' is the column
+    df1_pivot = pd.pivot_table(df1, values='Consumption active power', index='Hour', columns='Month')
+
+    # Normalize each month's column of the pivot to the 0-1 range
+    df1_pivot = (df1_pivot - df1_pivot.min()) / (df1_pivot.max() - df1_pivot.min())
+
+    # Create a dictionary to map month numbers to their corresponding names
+    month_map = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 
+                 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
+
+    if combined:
+        # Create a figure
+        fig = go.Figure()
+
+        # Iterate over months (columns) and add traces to the figure
+        for month_num in df1_pivot.columns:
+            df_month = df1_pivot[month_num]
+            month_name = month_map[month_num]
+            fig.add_trace(go.Scatter(x=df_month.index, y=df_month, mode='lines', name=month_name))
+
+        # Update the layout
+        help_info = "*Double click on a month for which you want to peek*"
+        fig.update_layout(title='Energy Consumption by Hour for Different Months',
+                          xaxis_title='Hour of the Day',
+                          yaxis_title='Total Electricity Consumption (kW)')
+
+        # Add an info icon
+        fig.add_annotation(
+            x=0.95,
+            y=0.95,
+            xref='paper',
+            yref='paper',
+            text='ℹ️',
+            showarrow=False,
+            font=dict(size=24),
+            hovertext=help_info
+        )
+
+        # Display the plot using Streamlit
+        st.plotly_chart(fig)
+
+    else:
+        # Create a subplot with 12 small plots
+        fig = make_subplots(rows=4, cols=3, subplot_titles=[month_map[i+1] for i in range(12)], shared_xaxes=True, shared_yaxes=True)
+
+        # Iterate over months (columns) and add a subplot for each
+        for i, month_num in enumerate(df1_pivot.columns):
+            row = i // 3 + 1
+            col = i % 3 + 1
+            df_month = df1_pivot[month_num]
+            month_name = month_map[month_num]
+            fig.add_trace(go.Scatter(x=df_month.index, y=df_month, mode='lines', name=month_name), row=row, col=col)
+
+        # Update the layout
+        fig.update_layout(height=800, width=1000, title_text='Energy Consumption by Hour for Each Month', showlegend=False)
+        st.plotly_chart(fig)
+
+
+
+def merge_and_create_dataframe(df, cl_df):
+    # Rename the 'metric' column to 'Month'; the 0-based metric positions map to months 1-12
+    cl_df.rename(columns={'metric': 'Month'}, inplace=True)
+    cl_df['Month'] = cl_df['Month'] % 12 + 1
+
+    # Merging the dataframes
+    df_merged = pd.merge(df, cl_df, on='Month', how='left')
+
+    # Selecting only the required columns
+    df_merged = df_merged[['Month', 'Days', 'cluster']]
+
+    return df_merged
+
+# Function to map numerical months to month names
+def map_month(num):
+    months = {
+        1: 'January',
+        2: 'February',
+        3: 'March',
+        4: 'April',
+        5: 'May',
+        6: 'June',
+        7: 'July',
+        8: 'August',
+        9: 'September',
+        10: 'October',
+        11: 'November',
+        12: 'December'
+    }
+    return months[num]
+
+
+@st.cache_data
+def download_df(df_merged):
+    # Cache the CSV conversion so it is not recomputed on every rerun
+    return df_merged.to_csv().encode('utf-8')
+
+
+
+
+### main application
+def app():
+    st.sidebar.title('Energy Scenario Planning')
+    activities = ["VISUALIZE THE CLUSTERS", "EXPLORE YOURSELF"]
+    choice = st.sidebar.selectbox("Select an activity (choose EXPLORE YOURSELF to set the parameters yourself)", activities)
+
+    uploaded_file = st.file_uploader("Right now we only accept CSV file format", type="csv")
+    global df, df1  # Global declaration
+
+    if uploaded_file is not None:
+        if choice == "VISUALIZE THE CLUSTERS":
+            df, df1 = time_series_analysis(uploaded_file, display_full=False)
+            df2, X, list_clust, mod_choice, cl_df = clustering(df1, n_clusters=5, number=10, mod_choice='Time Series K-Means', iter=15, n_init=2)
+            #print(df2)
+            # Plot with all 12 months combined
+            #plot_energy_consumption(df)
+            # Plot with each month separately
+            plot_energy_consumption(df, combined=False)
+
+            visualization_clusters(df2, mod_choice, list_clust)
+            df_merged = merge_and_create_dataframe(df, cl_df)
+            #print(df_merged)
+            """
+            # Prepare data for the grouped bar chart
+            new_bar_data = df_merged.copy()
+            new_bar_data.columns = ['Month', 'Days', 'Cluster']
+            new_bar_data['Month'] = new_bar_data['Month'].apply(map_month)
+
+            # Draw the grouped bar chart
+            fig = go.Figure()
+            for month in new_bar_data['Month'].unique():
+                data = new_bar_data[new_bar_data['Month'] == month]
+                #fig.add_trace(go.Bar(x=data['Month'], y=data['Days'], name=month))
+                fig.add_trace(go.Bar(x=data['Cluster'], y=data['Days'], name=month))
+
+            fig.update_layout(barmode='stack', title='Data Dispersion in Each Cluster', xaxis_title='clusters', yaxis_title='Days + energy(kW)')
+            st.plotly_chart(fig)
+            """
+            # Step 1: Aggregating the data by 'Month', 'Days', and 'cluster'
+            aggregated_data = df_merged.groupby(['Month', 'Days', 'cluster']).size().unstack(fill_value=0)
+
+            # Step 2: Resetting the index for the DataFrame to plot using Plotly
+            aggregated_data.reset_index(inplace=True)
+
+            # Step 3: Melting the DataFrame for Plotly compatibility
+            aggregated_data_melted = aggregated_data.melt(id_vars=['Month', 'Days'], var_name='cluster', value_name='count')
+
+            # Step 4: Create the stacked bar chart using Plotly
+            fig = px.bar(
+                aggregated_data_melted,
+                x="Month",  # Month on the x-axis
+                y="count",  # Count of clusters on the y-axis
+                color="cluster",  # Different cluster numbers shown as different colors
+                facet_row="Days",  # Facet rows for different days (optional, can remove if not needed)
+                labels={'Month': 'Month', 'count': 'Cluster Count', 'cluster': 'Cluster Number'},
+                title="Stacked Bar Chart of Cluster Numbers by Day for Each Month",
+                color_discrete_sequence=px.colors.qualitative.Set1
+            )
+
+            # Step 5: Update layout for stacked bars
+            fig.update_layout(barmode='stack')
+
+            # Step 6: Display the plot in Streamlit
+            st.title('Cluster Data Aggregation and Visualization')
+            st.write("This chart shows the number of clusters for each day of the month across the year.")
+            st.plotly_chart(fig)
+
+            ####   Download icon for new generated data
+            csv = download_df(df_merged)
+            st.download_button(
+                label="Download new Scenario data as CSV",
+                data=csv,
+                file_name='new_df.csv',
+                mime='text/csv',
+            )
+
+        elif choice == "EXPLORE YOURSELF":
+            df, df1 = time_series_analysis(uploaded_file, display_full=True)
+            n_clusters, number, mod_choice, iter, n_init = choose_parameters()
+            df2, X, list_clust, mod_choice, cl_df = clustering(df1, n_clusters, number, mod_choice, iter, n_init)
+            visualization_clusters(df2, mod_choice, list_clust)
+
+            ####   Download icon for new generated data
+            df_merged = merge_and_create_dataframe(df, cl_df)
+            csv = download_df(df_merged)
+            st.download_button(
+                label="Download new Scenario data as CSV",
+                data=csv,
+                file_name='new_df.csv',
+                mime='text/csv',
+            )
+
+
+
+    st.markdown("***Developed in collaboration with Prof. Xavier Valdes, Krishna Kumar Oli, and Kishankumar Vaidya***")
+
+
+if __name__ == '__main__':
+    app()
-- 
GitLab