From 553b69f2470725870fa3e0449fa52e3577215cb5 Mon Sep 17 00:00:00 2001
From: Krishna Oli <krishna.oli@stud.th-deg.de>
Date: Mon, 17 Mar 2025 21:44:56 +0100
Subject: [PATCH] Upload New File

---
 model_1.py | 787 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 787 insertions(+)
 create mode 100644 model_1.py

diff --git a/model_1.py b/model_1.py
new file mode 100644
index 0000000..7076feb
--- /dev/null
+++ b/model_1.py
@@ -0,0 +1,787 @@
+##### Imports #####
+
+
+import math
+import streamlit as st
+import pandas as pd
+import textwrap
+import statsmodels.formula.api as smf
+import statsmodels.tsa.api as smt
+import statsmodels.api as sm
+import scipy.stats as scs
+import numba
+import numpy as np
+import seaborn as sns
+
+import matplotlib.cm as cm
+from mpl_toolkits.mplot3d import Axes3D
+from matplotlib import colors
+import matplotlib.pyplot as plt
+import altair as alt
+from matplotlib.colors import ListedColormap
+
+##### Plotly ######
+import plotly.express as px
+import plotly.graph_objects as go
+import plotly.figure_factory as ff
+import chart_studio
+from plotly import tools
+from plotly.subplots import make_subplots
+import plotly.colors as pc
+
+import time
+import datetime
+
+
+from scipy.fftpack import rfft
+from scipy.stats import boxcox
+from sklearn.cluster import AgglomerativeClustering, KMeans
+from sklearn.metrics import davies_bouldin_score
+from sklearn_extra.cluster import KMedoids
+from sklearn.metrics import calinski_harabasz_score
+from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram, linkage
+from sklearn.metrics.cluster import contingency_matrix
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.impute import SimpleImputer
+
+from datetime import timedelta
+
+
+# Algorithms
+from tslearn.barycenters import dtw_barycenter_averaging
+from tslearn.clustering import TimeSeriesKMeans
+#from sktime.distances import dtw_distance
+from dtaidistance import clustering, dtw
+#from fcmeans import FCM
+
+
+# Implementation for pyclustering kmeans
+from pyclustering.cluster.kmeans import kmeans
+from pyclustering.cluster.center_initializer import random_center_initializer
+from pyclustering.cluster.encoder import type_encoding
+from pyclustering.cluster.encoder import cluster_encoder
+from pyclustering.utils.metric import distance_metric
+from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
+from pyclustering.cluster.fcm import fcm
+from sklearn.metrics import pairwise_distances
+from validclust import cop, dunn
+
+
+# Preprocessing
+from tslearn.preprocessing import TimeSeriesScalerMeanVariance
+from netdata_pandas.data import get_data, get_chart_list
+from am4894plots.plots import plot_lines, plot_lines_grid
+from matplotlib.patches import Ellipse
+from sklearn import preprocessing
+
+#from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score, silhouette_samples
+from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
+from sklearn.model_selection import train_test_split
+#from statsmodels.tsa.arima_model import ARIMA
+
+import warnings  # `do not disturb` mode
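+
+
+# Illustrative only: a minimal sketch (not called anywhere in the app) of the CSV layout that
+# read_csv_and_auto_update_time_interval() below assumes: a parseable 'Time' column sampled at a
+# regular interval plus the power columns named in the help texts. The column names come from
+# this file; the values and the 15-minute interval are made-up examples.
+def _example_input_frame():
+    idx = pd.date_range("2023-01-01", periods=4, freq="15min")
+    return pd.DataFrame({
+        "Time": idx,
+        "Ess active power": [0.4, 0.3, 0.5, 0.2],
+        "Grid active power": [1.2, 1.1, 1.3, 1.0],
+        "Consumption active power": [1.6, 1.4, 1.8, 1.2],
+        "Production active power": [0.0, 0.0, 0.1, 0.3],
+    })
+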
+warnings.filterwarnings('ignore')
+
+####### DEF for null values #####
+
+def null_values(df):
+    null_test = (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False).index
+    null_data_test = pd.concat([
+        df.isnull().sum(axis=0),
+        (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False),
+        df.loc[:, df.columns.isin(list(null_test))].dtypes], axis=1)
+    null_data_test = null_data_test.rename(columns={0: '# null',
+                                                    1: '% null',
+                                                    2: 'type'}).sort_values(ascending=False, by='% null')
+    null_data_test = null_data_test[null_data_test["# null"] != 0]
+
+    return null_data_test
+
+
+# Return the dtype of every column as a one-column DataFrame
+def column_types(df):
+    return pd.DataFrame(df.dtypes, columns=['Type'])
+
+
+def preprocessing_meanvar(df):
+    # Scale every series to zero mean / unit variance, then restore the original frame layout
+    X = TimeSeriesScalerMeanVariance().fit_transform(df.values)
+    df = pd.DataFrame(X.reshape(df.shape), columns=df.columns, index=df.index)
+    return df
+
+
+def purity_score(y_true, y_pred):
+    # compute contingency matrix (also called confusion matrix)
+    confusion_matrix = contingency_matrix(y_true, y_pred)
+    # purity = share of samples that fall into the majority class of their cluster
+    return np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
+
+
+# Read a file and do EDA
+def read_csv_and_auto_update_time_interval(uploaded_file):
+    if uploaded_file is not None:
+        data = pd.read_csv(uploaded_file)
+        df = data.copy()
+
+        # Convert the 'Time' column to datetime
+        df['Time'] = pd.to_datetime(df['Time'])
+
+        # Infer the sampling interval from the 'Time' column
+        time_interval = (df['Time'].diff().mean()).seconds
+        time_interval_str = f"{time_interval // 60}T"
+
+        # Calculate the start date and end date based on the inferred time interval
+        start_date = df['Time'].min()
+        end_date = df['Time'].max()
+
+        # Use the inferred time interval to generate the date range
+        date_range = pd.date_range(start=start_date, end=end_date, freq=time_interval_str)
+
+        # Update the 'Date' column with the new date range
+        df['Date'] = date_range
+
+        df['Hour'] = df['Date'].dt.hour
+        df['Days'] = df['Date'].dt.day
+        df['Weekday'] = df['Date'].dt.weekday
+        df['Month'] = pd.DatetimeIndex(df['Date']).month
+
+        return df, time_interval_str
+
+
+def time_series_analysis(uploaded_file, display_full):
+    # st.subheader('2. Time Series Analysis')
+    df, time_interval = read_csv_and_auto_update_time_interval(uploaded_file)
+    all_cols = df.columns.tolist()
+    nums = df.select_dtypes(include=np.number).columns.tolist()[:4]
+    dates = list(filter(lambda x: 'date' in x.lower(), df.columns))
+    no_dates = list(filter(lambda x: 'date' not in x.lower(), df.columns))
+    df1 = pd.DataFrame()  # Initialize df1
+    if display_full:
+
+        help_text6 = "**Please choose one of Ess active power, Grid active power, Consumption active power, or
Production active power**" + valu_col = st.selectbox( + 'Pick which data column you want for analysis?', (nums), 1, help=help_text6) + + dat_col = dates[0] + + piv_col = 'Month' # Replace this with the actual column name for pivot + + dat_plt = st.selectbox( + 'Pick the TIME for PLOTS in your data?', + ('YEAR', 'YEAR-MONTH', 'Days', 'Hour', 'Month', 'Weekday'), + index=3 + ) + + filt_col = 'Weekday' # Replace this with the actual column name for filtering + + piks = df[filt_col].unique() + + filtrs = st.multiselect('Select values to be included', piks, default=piks) + + df_filtered = df[df[filt_col].isin(filtrs)] + + df_filtered['YEAR'] = pd.to_datetime(df_filtered[dat_col]).dt.year + df_filtered['YEAR-MONTH'] = pd.to_datetime(df_filtered[dat_col]).dt.to_period('M') + + df1 = pd.pivot_table(df_filtered, values=valu_col, index=dat_plt, columns=piv_col) + # Normalize the 'Consumption active power' column between 0 and 1 for each hour + df1 = (df1 - df1.min()) / (df1.max() - df1.min()) + + # Create a list of month names + month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] + # Check the length of the DataFrame columns and the month names list + if len(df1.columns) == len(month_names): + # Set the month names as the columns + df1.columns = month_names + else: + st.error("Length mismatch: The number of columns in the DataFrame does not match the length of the month names list.") + #df1.columns = month_names + #df1 = df1.reset_index() + + #df1.set_index(dat_plt, inplace=True) + #df1.index = df1.index.astype(str) + + se = df1.sum() + + start_date, end_date = st.select_slider( + 'Select the Hours range', + options=df1.index, + value=(df1.index[0], df1.index[-1]) + ) + + st.write('You have selected between', start_date, ' and ', end_date, 'hours') + + # if st.button('Series PLot from selection'): + fig = px.line(df1.loc[start_date:end_date]) + # Customize the layout if needed + fig.update_layout( + title="Total Energy Consumption", + xaxis_title="Hour", + yaxis_title="Consumption Active Power in kW" + ) + st.plotly_chart(fig) + + + + else: + # Select columns with numeric data + nums = df.select_dtypes(include=np.number).columns.tolist() + + # Initialize df1 + df1 = pd.DataFrame() + + # Set the value column and filters + valu_col = 'Consumption active power' # Use the 'Consumption active power' column for the plot + dat_plt = 'Hour' # X-axis will represent 'Hour' + filt_col = 'Month' # Filter data by 'Month' + + # Create a pivot table where the 'Hour' is the index, and columns are the months + df1 = pd.pivot_table(df, values=valu_col, index=dat_plt, columns=filt_col) + + # Normalize the 'Consumption active power' column between 0 and 1 for each hour + df1 = (df1 - df1.min()) / (df1.max() - df1.min()) + + # Create a plotly figure + fig = go.Figure() + + # Add a trace for each month + for month in df1.columns: + fig.add_trace(go.Scatter( + x=df1.index, # Hour of the day + y=df1[month], # Normalized consumption power + mode='lines', # Line plot + name=month # Month name as the trace name + )) + + # Update layout for better visualization + fig.update_layout( + title="Normalized Consumption Power by Hour of the Day", + xaxis_title="Hour of the Day", + yaxis_title="Normalized Consumption Power (kW)", + legend_title="Month", + template="plotly_dark", # Dark theme, you can change it to "plotly" for a light theme + ) + + # Show the plot + st.plotly_chart(fig) + + return df, df1 + + + + +def choose_parameters(): + help_text1 
= "**Selecting the number of clusters** involves finding the ideal count of distinct groups within a dataset during cluster analysis." + n_clusters = st.sidebar.number_input("Choose the number of Clusters", 2, 50, step=1, key='no_of_clusters', help=help_text1) + + help_text2 = "**By initializing a random seed**, it is possible to reproduce the same sequence of random numbers, which is crucial for ensuring the reproducibility of experiments." + number = st.sidebar.number_input('Random Seed', min_value=10, step=15, help=help_text2) + + mod_l = ['Time Series K-Means', 'hierarchical', 'DBA', 'KMedoids'] + help_text3 = "**Time Series**: Time series refers to a sequence of data points recorded at specific time intervals. This data format is used to analyze trends, patterns, and behaviors over time.\n**K-Means**: K-Means is a popular clustering algorithm that partitions data into 'k' clusters, aiming to minimize the sum of squared distances between the data points and the cluster centroids.\n**Hierarchical**: Hierarchical clustering is an algorithm that builds a hierarchy of clusters by either merging or splitting them successively based on the similarity between data points.\n**DBA (Dynamic Time Warping Barycenter Averaging)**: DBA is a technique used to align time series data, finding a representative average pattern or prototype from a set of time series sequences.\n**KMedoids**: KMedoids is a clustering algorithm similar to K-Means, but instead of using the mean value of the points in a cluster as the center, it uses the most centrally located point as the medoid." + mod_choice = st.sidebar.selectbox("Select model", mod_l, help=help_text3) + + help_text4 = "**This parameter helps in regulating the convergence of the algorithm**, enabling the user to balance computational efficiency with the quality of results." + iter = st.sidebar.number_input('Max Iteration', min_value=10, step=5, help=help_text4) + + help_text5 = "**It is a parameter used in k-means clustering** to improve the quality of the final clustering by running the algorithm multiple times and selecting the solution with the lowest inertia (or within-cluster sum of squares)." 
+ n_init = st.sidebar.number_input('n init', min_value=2, step=2, help=help_text5) + + return n_clusters, number, mod_choice, iter, n_init + + +def clustering(df1, n_clusters, number, mod_choice, iter, n_init): + Xx = df1.copy() + + # Check for and handle missing values + if Xx.isna().sum().sum() > 0: + imputer = SimpleImputer(strategy='mean') + Xx = imputer.fit_transform(Xx) + + # Scale the data + scaler = MinMaxScaler() + Xx = scaler.fit_transform(Xx) + + kmeans = KMeans(n_clusters) + kmeans.fit(Xx) + + clustered_data = df1.copy() # Create a new DataFrame + clustered_data['cluster_pred'] = kmeans.fit_predict(Xx) # Add the 'cluster_pred' column + + x_array = np.array(df1) + + scaler = MinMaxScaler() + x_scaled = scaler.fit_transform(x_array) + + #if st.sidebar.checkbox("Dynamic Clustering"): + #st.subheader("Dynamic Clusterng ") + ## Dynamic Clustering + norm = True # data to 0-1 range normalizing + preprocessing_meanvar = False # Set to True if use TimeSeriesScalerMeanVariance + + # Fast Fourier Transform (fft) and Clustering of Time Series + # Set to True if to do the clustering based on fft transformation of X : can use in future development + preprocessing_fft = False + preprocessing_sqrt = True + preprocessing_log = False + model = mod_choice # ['Partitional', 'hierarchical', 'DBA', 'KMedoids'] # you can add DTW, soft-DTW, kshape too + min_n = 0 # only interested in clusters with min_n or more members + + # throwing some errors + df2= df1.copy() + + df2 = df2.loc[:, ~df2.columns.duplicated()] + + # To drop any empty columns + df2 = df2.dropna(axis=1, how='all') + + # use if necessary to try remove any N/A values and use forward fill and backward fill to + df2 = df2.ffill().bfill() + + df2 = df2.fillna(df2.mean()) + + # do sqrt + if preprocessing_sqrt: + df2 = df2.apply(lambda col: np.sqrt(col)) + + # do log if set True + if preprocessing_log: + df2 = df2.apply(lambda col: np.log1p(col)) + + # take differences if specified + if np.diff: + df2 = df2.diff() + + start_time = time.perf_counter() + + # normalizing data once + if norm: + df2 = (df2 - df2.min()) / (df2.max() - df2.min()) + + # drop any empty columns that may remain + df2 = df2.dropna(axis=1, how='all') + + df2.head() + # Handle missing values with SimpleImputer + imputer = SimpleImputer(strategy='mean') + df2 = imputer.fit_transform(df2) + + # Get values to cluster on + if preprocessing_fft: + X = rfft(df2).transpose() + else: + X = df2.transpose() + + #iter = st.sidebar.number_input('Max Iteration', min_value=10, step=5) + + #n_init = st.sidebar.number_input('n init', min_value=2, step=2) + + + if (mod_choice == 'Time Series K-Means'): + #dis = ["euclidean", "dtw", "softdtw"] + #dis_choice = st.sidebar.selectbox("Select Distance measure", dis) + mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, max_iter=iter, n_init=n_init, random_state=number).fit(X) + + + elif (mod_choice == 'DTW'): + dis = ["euclidean", "dtw", "softdtw"] + dis_choice = st.sidebar.selectbox("Select Distance measure", dis) + mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, metric=dis_choice, max_iter=iter, n_init=n_init, random_state=number).fit(X) + + + elif (mod_choice == 'softdtw'): + dis = ["euclidean", "dtw", "softdtw"] + dis_choice = st.sidebar.selectbox("Select Distance measure", dis) + mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, metric=dis_choice, max_iter=iter, n_init=n_init, random_state=number).fit(X) + + elif (mod_choice == 'hierarchical'): + dis = ["euclidean", "manhattan", "cosine"] + dis_choice = st.sidebar.selectbox("Select 
Affinity measure", dis) + link = ['single', 'complete', 'average', 'ward'] + link_choice = st.sidebar.selectbox("Select Linkage", link) + mod_choice = AgglomerativeClustering(distance_threshold=None, n_clusters=n_clusters, affinity=dis_choice, + linkage=link_choice).fit(X) + if st.checkbox("Plot Dendrogram"): + #plots_title = ( 'The Dendrogram for %d' % link_choice ) + + if link_choice == 'single': + Z = linkage(X, 'single') + elif link_choice == 'complete': + Z = linkage(X, 'complete') + elif link_choice == 'average': + Z = linkage(X, 'average') + elif link_choice == 'ward': + Z = linkage(X, 'ward') + + fig = ff.create_dendrogram(Z) + #fig.update_layout(color_threshold=1.5) + st.plotly_chart(fig) + + elif (mod_choice == 'DBA'): + dis = ["euclidean", "dtw", "softdtw"] + dis_choice = st.sidebar.selectbox("Select Distance measure", dis) + mod_choice = TimeSeriesKMeans(n_clusters=n_clusters, n_init=n_init, metric=dis_choice, verbose=True, + max_iter_barycenter=iter, + random_state=number).fit(X) + + elif (mod_choice == 'KMedoids'): + dis = ["euclidean", "manhattan", "chebyshev", "canberra", "minkowski", "cosine"] + dis_choice = st.sidebar.selectbox("Select Distance measure", dis) + mod_choice = KMedoids(n_clusters=n_clusters, metric=dis_choice, method='pam', random_state=number).fit(X) + + silhouette_avrg = silhouette_score(X, mod_choice.labels_) + #print("Latest Silhouette ",round(silhouette_avrg,2), " average is.") + + df2 = pd.DataFrame(df2) + # Generate a darafreame along with metrics and their cluster labels + cl_df = pd.DataFrame(list(zip(df2.columns, mod_choice.labels_)), columns=['metric', 'cluster']) + + # Generating helper dictionaries and lists + cl_met = cl_df.groupby(['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict() + cl_count = cl_df['cluster'].value_counts().to_dict() + a_clust = [cluster for cluster in cl_count] + drp_clust = [cluster for cluster in cl_count if cl_count[cluster] < min_n] + list_clust = [cluster for cluster in cl_count if cl_count[cluster] >= min_n] + clst_final= np.array(list_clust) + + data2 = np.array([mod_choice.labels_]) + + d_cl_qua = {} + for clst_i in a_clust: + cr_x = df2[cl_met[clst_i]].corr().abs().values + cr_x_mean = round(cr_x[np.triu_indices(cr_x.shape[0], 1)].mean(), 2) + d_cl_qua[clst_i] = cr_x_mean + + # get quality score for each cluster + #if st.checkbox("View CVI's"): + + #clst_eqal_choice == 'sil' + silhouette_scores = silhouette_samples(X, mod_choice.labels_) + cl_df_sil = pd.DataFrame( + list(zip(mod_choice.labels_, silhouette_scores)), + columns=['cluster', 'silhouette_score']) + cl_df_sil = cl_df_sil.groupby(['cluster']).max() + # print("Cluster Sil ",cl_df_sil, " score.") + d_cl_qua = cl_df_sil.to_dict()['silhouette_score'] + # print("1. SIlhouette index ", silhouette_avrg, " is.") + #print("2. 
Latest Silhouette ", round(silhouette_avrg, 2), " average is.") + + # build cluster level df with some cluster metadata + cl_df_met = pd.DataFrame.from_dict(cl_count, orient='index', columns=['n']) + cl_df_met.index.names = ['cluster'] + #d_cl_qua = silhouette_avrg.to_dict()['silhouette_score'] + cl_df_met['quality_score'] = cl_df_met.index.map(d_cl_qua) + cl_df_met = cl_df_met.sort_values('quality_score', ascending=False) + + return df2, X, list_clust, mod_choice, cl_df + + +### Function to visualize all the clusters +def visualization_clusters(df2, mod_choice, list_clust): + # Create a figure to plot all clusters together + all_clusters_fig = go.Figure() + + clustered_data = df2.groupby(mod_choice.labels_, axis=1) + print("######################") + st.write(clustered_data) + + # Filter out clusters that are not in list_clust + clustered_data = {k: v for k, v in clustered_data if k in list_clust} + + # Function to generate random data points between two lines + def generate_random_between_lines(upper_line, lower_line, num_points=24, scale_factor=1.0): + random_data = np.random.uniform(lower_line, upper_line, num_points) * scale_factor + return random_data + + # Create lists to hold the mean, max, and min values for each cluster + mean_data = [] + upper_quartile = [] + lower_quartile = [] + + for cluster_id in list_clust: + mean_data.append(clustered_data[cluster_id].mean(axis=1)) + upper_quartile.append(clustered_data[cluster_id].quantile(0.75, axis=1)) + lower_quartile.append(clustered_data[cluster_id].quantile(0.25, axis=1)) + + # Sort the clusters in ascending order + sorted_clusters = sorted(zip(list_clust, mean_data, upper_quartile, lower_quartile), key=lambda x: x[0]) + mean_aggregated = np.mean([clustered_data[cluster_id].mean(axis=1) for cluster_id in list_clust], axis=0).tolist() + max_aggregated = np.max([clustered_data[cluster_id].quantile(0.75, axis=1) for cluster_id in list_clust], axis=0).tolist() + min_aggregated = np.min([clustered_data[cluster_id].quantile(0.25, axis=1) for cluster_id in list_clust], axis=0).tolist() + + all_clusters_fig = go.Figure() + for cluster_id in list_clust: + all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=clustered_data[cluster_id].mean(axis=1).tolist(), mode='lines', name=f'Cluster {cluster_id}')) + + all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=mean_aggregated, mode='lines', name='Mean', line=dict(color='black'))) + all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=max_aggregated, mode='lines', name='Max', line=dict(color='lightgray'))) + all_clusters_fig.add_trace(go.Scatter(x=df2.columns, y=min_aggregated, mode='lines', name='Min', fill='tonexty', line=dict(color='lightgray'))) + all_clusters_fig.update_layout(xaxis_title="Metrics", yaxis_title="Values", title="All Clusters along with mean, min, max lines", showlegend=True) + + st.plotly_chart(all_clusters_fig) + + # Function to handle button click + def on_button_click(): + for i, (cluster_id, cluster_mean, cluster_upper, cluster_lower) in enumerate(sorted_clusters): + fig = go.Figure() + + # Add original cluster line + fig.add_trace(go.Scatter(x=df2.columns, y=cluster_mean, mode='lines', name=f'Cluster {cluster_id}')) + # Original mean line + #mean = np.mean([mean_data[cluster_id] for cluster_id in list_clust], axis=0) + #fig.add_trace(go.Scatter(x=df2.columns, y=mean, mode='lines', name='Mean', line=dict(color='black'))) + + # Generate random data points between upper and lower bounds + new_max_data = generate_random_between_lines(cluster_upper, cluster_mean) + 
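+            # Scenario construction (see generate_random_between_lines above): for each of the
+            # num_points=24 positions (one per hour, assuming an hourly index) a value is drawn
+            # uniformly between the cluster mean curve and the corresponding quartile curve.
+            # Each draw is then averaged with that quartile curve (updated_max_line /
+            # updated_min_line below) and finally blended with the across-cluster quartile
+            # extremes before being plotted as the dashed Max/Min traces.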
new_min_data = generate_random_between_lines(cluster_mean, cluster_lower) + + # Update the max and min curves + updated_max_line = np.mean([cluster_upper, new_max_data], axis=0) + updated_min_line = np.mean([cluster_lower, new_min_data], axis=0) + + # Update the figure with the new min and max curves + #fig = update_min_max_curves(fig, df2.columns, updated_max_line, updated_min_line, cluster_id) + #fig.add_trace(go.Scatter(x=df2.columns, y=updated_max_line, mode='lines', name='New_Max', line=dict(width=3))) + #fig.add_trace(go.Scatter(x=df2.columns, y=updated_min_line, mode='lines', name='New_Min', line=dict(width=3))) + + # Add aggregated max line + new_aggregated_max_line = np.mean([np.max(upper_quartile, axis=0), updated_max_line], axis=0) + fig.add_trace(go.Scatter(x=df2.columns, y=new_aggregated_max_line, mode='lines', name='Max', line=dict(width=3, dash='dash'))) + + # Add aggregated min line + new_aggregated_min_line = np.mean([np.min(lower_quartile, axis=0), updated_min_line], axis=0) + fig.add_trace(go.Scatter(x=df2.columns, y=new_aggregated_min_line, mode='lines', name='Min', fill='tonexty', line=dict(width=3, dash='dash'))) + + # Set layout for each subplot + plot_title = f"Cluster {cluster_id} with Random Values" + fig.update_layout( + #xaxis_title="Metrics", + yaxis_title="Total electricity consumption kW", + title=plot_title, + ) + + # Show the plot for each cluster separately + st.plotly_chart(fig, use_container_width=True) + + # Add a button to generate random numbers + if st.button('Generate Energy Scenario with random values'): + on_button_click() + + + +def plot_energy_consumption(data, combined=True): + # Create a DataFrame with the necessary columns + df1 = data[['Hour', 'Month', 'Consumption active power']] + + # Normalize the 'Consumption active power' between 0 and 1 + df1['Consumption active power'] = (df1['Consumption active power'] - df1['Consumption active power'].min()) / (df1['Consumption active power'].max() - df1['Consumption active power'].min()) + + # Create a pivot table where 'Hour' is the index, and 'Month' is the column + df1_pivot = pd.pivot_table(df1, values='Consumption active power', index='Hour', columns='Month') + + # Normalize the 'Consumption active power' column between 0 and 1 for each hour + df1_pivot = (df1_pivot - df1_pivot.min()) / (df1_pivot.max() - df1_pivot.min()) + + # Create a dictionary to map month numbers to their corresponding names + month_map = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', + 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'} + + if combined: + # Create a figure + fig = go.Figure() + + # Iterate over months (columns) and add traces to the figure + for month_num in df1_pivot.columns: + df_month = df1_pivot[month_num] + month_name = month_map[month_num] + fig.add_trace(go.Scatter(x=df_month.index, y=df_month, mode='lines', name=month_name)) + + # Update the layout + help_info = "*Double click on a month for which you want to peek*" + fig.update_layout(title='Energy Consumption by Hour for Different Months', + xaxis_title='Hour of the Day', + yaxis_title='Total Electricity Consumption (kW)') + + # Add an info icon + fig.add_annotation( + x=0.95, + y=0.95, + xref='paper', + yref='paper', + text='ℹï¸', + showarrow=False, + font=dict(size=24), + hovertext=help_info + ) + + # Display the plot using Streamlit + st.plotly_chart(fig) + + else: + # Create a subplot with 12 small plots + fig = make_subplots(rows=4, cols=3, subplot_titles=[month_map[i+1] for i 
in range(12)], shared_xaxes=True, shared_yaxes=True) + + # Iterate over months (columns) and add a subplot for each + for i, month_num in enumerate(df1_pivot.columns): + row = i // 3 + 1 + col = i % 3 + 1 + df_month = df1_pivot[month_num] + month_name = month_map[month_num] + fig.add_trace(go.Scatter(x=df_month.index, y=df_month, mode='lines', name=month_name), row=row, col=col) + + # Update the layout + fig.update_layout(height=800, width=1000, title_text='Energy Consumption by Hour for Each Month', showlegend=False) + st.plotly_chart(fig) + + + +def merge_and_create_dataframe(df, cl_df): + # Renaming the 'metric' column to 'Month' and adjusting the values + cl_df.rename(columns={'metric': 'Month'}, inplace=True) + cl_df['Month'] = cl_df['Month'] % 12 + 1 # Modifying the values to be from 1 to 12 + + # Merging the dataframes + df_merged = pd.merge(df, cl_df, on='Month', how='left') + + # Selecting only the required columns + df_merged = df_merged[['Month', 'Days', 'cluster']] + + return df_merged + +# Function to map numerical months to month names +def map_month(num): + months = { + 1: 'January', + 2: 'February', + 3: 'March', + 4: 'April', + 5: 'May', + 6: 'June', + 7: 'July', + 8: 'August', + 9: 'September', + 10: 'October', + 11: 'November', + 12: 'December' + } + return months[num] + + +def download_df(df_merged): +# IMPORTANT: Cache the conversion to prevent computation on every rerun + return df_merged.to_csv().encode('utf-8') + + + + +### main application +def app(): + st.sidebar.title('Energy Scenario Planning') + activities = ["VISUALIZE THE CLUSTERS", "EXPLORE YOURSELF"] + choice = st.sidebar.selectbox("Would you like to try yourself: Select other", activities) + + uploaded_file = st.file_uploader("Right now we only accept CSV file format", type="csv") + global df, df1 # Global declaration + + if uploaded_file is not None: + if choice == "VISUALIZE THE CLUSTERS": + df, df1 = time_series_analysis(uploaded_file, display_full=False) + df2, X, list_clust, mod_choice, cl_df = clustering(df1, n_clusters=5, number=10, mod_choice='Time Series K-Means', iter=15, n_init=2) + #print(df2) + # Plot with all 12 months combined + #plot_energy_consumption(df) + # Plot with each month separately + plot_energy_consumption(df, combined=False) + + visualization_clusters(df2, mod_choice, list_clust) + df_merged = merge_and_create_dataframe(df, cl_df) + #print(df_merged) + """ + # Prepare data for the grouped bar chart + new_bar_data = df_merged.copy() + new_bar_data.columns = ['Month', 'Days', 'Cluster'] + new_bar_data['Month'] = new_bar_data['Month'].apply(map_month) + + # Draw the grouped bar chart + fig = go.Figure() + for month in new_bar_data['Month'].unique(): + data = new_bar_data[new_bar_data['Month'] == month] + #fig.add_trace(go.Bar(x=data['Month'], y=data['Days'], name=month)) + fig.add_trace(go.Bar(x=data['Cluster'], y=data['Days'], name=month)) + + fig.update_layout(barmode='stack', title='Data Dispersion in Each Cluster', xaxis_title='clusters', yaxis_title='Days + energy(kW)') + st.plotly_chart(fig) + """ + # Step 1: Aggregating the data by 'Month', 'Days', and 'cluster' + aggregated_data = df_merged.groupby(['Month', 'Days', 'cluster']).size().unstack(fill_value=0) + + # Step 2: Resetting the index for the DataFrame to plot using Plotly + aggregated_data.reset_index(inplace=True) + + # Step 3: Melting the DataFrame for Plotly compatibility + aggregated_data_melted = aggregated_data.melt(id_vars=['Month', 'Days'], var_name='cluster', value_name='count') + + # Step 4: Create the 
stacked bar chart using Plotly + fig = px.bar( + aggregated_data_melted, + x="Month", # Month on the x-axis + y="count", # Count of clusters on the y-axis + color="cluster", # Different cluster numbers shown as different colors + facet_row="Days", # Facet rows for different days (optional, can remove if not needed) + labels={'Month': 'Month', 'count': 'Cluster Count', 'cluster': 'Cluster Number'}, + title="Stacked Bar Chart of Cluster Numbers by Day for Each Month", + color_discrete_sequence=px.colors.qualitative.Set1 + ) + + # Step 5: Update layout for stacked bars + fig.update_layout(barmode='stack') + + # Step 6: Display the plot in Streamlit + st.title('Cluster Data Aggregation and Visualization') + st.write("This chart shows the number of clusters for each day of the month across the year.") + st.plotly_chart(fig) + + #### Download icon for new generated data + csv = download_df(df_merged) + st.download_button( + label="Download new Scenario data as CSV", + data=csv, + file_name='new_df.csv', + mime='text/csv', + ) + + elif choice == "EXPLORE YOURSELF": + df, df1 = time_series_analysis(uploaded_file, display_full=True) + n_clusters, number, mod_choice, iter, n_init = choose_parameters() + df2, X, list_clust, mod_choice, cl_df = clustering(df1, n_clusters, number, mod_choice, iter, n_init) + visualization_clusters(df2, mod_choice, list_clust) + + #### Download icon for new generated data + df_merged = merge_and_create_dataframe(df, cl_df) + csv = download_df(df_merged) + st.download_button( + label="Download new Scenario data as CSV", + data=csv, + file_name='new_df.csv', + mime='text/csv', + ) + + + + st.markdown("***Developed in collaboration with Prof. Xavier Valdes, Krishna Kumar Oli, and Kishankumar Vaidya***") + + +if __name__ == '__main__': + app() -- GitLab