Processing file: /home/jvaldes/Desktop/krishna-thesis/thesiscodeimplementation/data/Autohaus 2.csv
PAM with Euclidean: Silhouette=0.429, DB=0.743, CH=25225.074
PAM with DTW: Silhouette=0.427, DB=0.742, CH=24788.624
PAM with SBD: Silhouette=0.429, DB=0.743, CH=25225.074
K-shape with Euclidean: Silhouette=0.505, DB=0.652, CH=27923.189
K-shape with DTW: Silhouette=0.505, DB=0.652, CH=27923.189
K-shape with SBD: Silhouette=0.505, DB=0.652, CH=27923.189
DBA with Euclidean: Silhouette=0.509, DB=0.646, CH=28589.939
Error with DBA and DTW: Incorrect metric: <function dtw at 0x7f3cfafdac10> (should be one of 'dtw', 'softdtw', 'euclidean')
Error with DBA and SBD: Incorrect metric: <function sbd_distance at 0x7f3dffd4f040> (should be one of 'dtw', 'softdtw', 'euclidean')
Single with Euclidean: Silhouette=0.805, DB=0.143, CH=205.795
Single with DTW: Silhouette=0.805, DB=0.143, CH=205.795
Single with SBD: Silhouette=0.805, DB=0.143, CH=205.795
Average with Euclidean: Silhouette=0.823, DB=0.369, CH=1220.481
Average with DTW: Silhouette=0.823, DB=0.369, CH=1220.481
Average with SBD: Silhouette=0.823, DB=0.369, CH=1220.481
Complete with Euclidean: Silhouette=0.659, DB=0.482, CH=1881.441
Complete with DTW: Silhouette=0.659, DB=0.482, CH=1881.441
Complete with SBD: Silhouette=0.659, DB=0.482, CH=1881.441
Ward with Euclidean: Silhouette=0.486, DB=0.633, CH=25566.485
Ward with DTW: Silhouette=0.486, DB=0.633, CH=25566.485
Ward with SBD: Silhouette=0.486, DB=0.633, CH=25566.485
Error with Centroid and Euclidean: Unknown linkage type centroid. Valid options are dict_keys(['ward', 'complete', 'average', 'single'])
Error with Centroid and DTW: Unknown linkage type centroid. Valid options are dict_keys(['ward', 'complete', 'average', 'single'])
Error with Centroid and SBD: Unknown linkage type centroid. Valid options are dict_keys(['ward', 'complete', 'average', 'single'])
Error with Median and Euclidean: Unknown linkage type median. Valid options are dict_keys(['ward', 'complete', 'average', 'single'])
Error with Median and DTW: Unknown linkage type median. Valid options are dict_keys(['ward', 'complete', 'average', 'single'])
Error with Median and SBD: Unknown linkage type median. Valid options are dict_keys(['ward', 'complete', 'average', 'single'])
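The two DBA errors above come from passing distance functions where tslearn expects a metric name, and the linkage errors come from requesting linkages scikit-learn does not implement. A minimal sketch of the accepted values (an illustration, not part of the original script):

# tslearn expects a metric *string*, one of "euclidean", "dtw", "softdtw" -- not a callable;
# with metric="dtw", DBA is used for the barycenter computation.
from tslearn.clustering import TimeSeriesKMeans
dba = TimeSeriesKMeans(n_clusters=3, metric="dtw")

# scikit-learn's AgglomerativeClustering implements only 'ward', 'complete',
# 'average', and 'single'; 'centroid' and 'median' exist only in scipy:
from sklearn.cluster import AgglomerativeClustering
agg = AgglomerativeClustering(n_clusters=3, linkage="average")
# 'centroid'/'median' can be obtained via scipy instead, e.g.:
# from scipy.cluster.hierarchy import linkage, fcluster
# Z = linkage(features, method="centroid")
# labels = fcluster(Z, t=3, criterion="maxclust")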
##### Imports #####
import os
import math
import streamlit as st
import pandas as pd
import textwrap
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
import numba
import numpy as np
import seaborn as sns
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import colors
import matplotlib.pyplot as plt
import altair as alt
##### Plotly ######
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import chart_studio
from plotly import tools
from plotly.subplots import make_subplots
import time
import datetime
from datetime import timedelta
from scipy.fftpack import rfft
from scipy.stats import boxcox
from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, pairwise_distances
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.cluster import contingency_matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn_extra.cluster import KMedoids
# Algorithms
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sktime.distances import dtw_distance
from dtaidistance import clustering, dtw
#from fcmeans import FCM
# Implementation for pyclustering kmeans
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import random_center_initializer, kmeans_plusplus_initializer
from pyclustering.cluster.encoder import type_encoding, cluster_encoder
from pyclustering.utils.metric import distance_metric
from pyclustering.cluster.fcm import fcm
from validclust import cop, dunn
# Preprocessing / plotting helpers
from netdata_pandas.data import get_data, get_chart_list
from am4894plots.plots import plot_lines, plot_lines_grid
from matplotlib.patches import Ellipse
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
####### DEF for null values #####
def null_values(df):
    """Summarise columns that contain nulls: count, share, and dtype."""
    null_test = (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False).index
    null_data_test = pd.concat([
        df.isnull().sum(axis=0),
        (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False),
        df.loc[:, df.columns.isin(list(null_test))].dtypes], axis=1)
    null_data_test = null_data_test.rename(columns={0: '# null',
                                                    1: '% null',
                                                    2: 'type'}).sort_values(ascending=False, by='% null')
    null_data_test = null_data_test[null_data_test["# null"] != 0]
    return null_data_test

def type(df):
    # NB: shadows the built-in `type`; kept for compatibility with existing callers
    return pd.DataFrame(df.dtypes, columns=['Type'])

def preprocessing_meanvar(df):
    # z-normalise each row as one time series (the original referenced X before assignment)
    X = TimeSeriesScalerMeanVariance().fit_transform(df)
    df = pd.DataFrame(X.reshape(df.shape), columns=df.columns, index=df.index)
    return df

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    confusion_matrix = contingency_matrix(y_true, y_pred)
    # purity: fraction of samples assigned to the majority true class of their cluster
    return np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
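# Worked example for purity_score (illustrative values, not from the thesis data): with
# y_true = [0, 0, 1, 1, 2, 2] and y_pred = [1, 1, 0, 2, 2, 2], cluster 0 holds one
# true-1 sample, cluster 1 two true-0 samples, and cluster 2 one true-1 plus two
# true-2 samples, so purity_score returns (1 + 2 + 2) / 6 = 5/6.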
## 1. Exploratory Data Analysis
def exploratory_data_analysis(data):
    """
    Perform exploratory data analysis on a dataset with 15-minute time intervals.
    Args:
        data (pd.DataFrame): Input dataset with a datetime column named 'Time'.
    Returns:
        pd.DataFrame: Processed DataFrame with additional features.
    """
    # Ensure 'Time' column is in datetime format
    data['Time'] = pd.to_datetime(data['Time'])
    # Set 'Time' column as the index
    data.set_index('Time', inplace=True)
    # Extract temporal features
    data['Hour'] = data.index.hour
    data['Minute'] = data.index.minute
    data['Day'] = data.index.day
    data['Weekday'] = data.index.weekday  # 0=Monday, 6=Sunday
    data['Month'] = data.index.month
    data['DayOfYear'] = data.index.dayofyear
    # Impute missing values with the column mean
    imputer = SimpleImputer(strategy="mean")
    data.iloc[:, :] = imputer.fit_transform(data)
    # Display summary statistics
    print(data.describe())
    return data
## Preprocess for clustering
def preprocess_for_clustering(data):
    """
    Preprocess the data for clustering by aggregating it to a daily level.
    Args:
        data (pd.DataFrame): Input DataFrame containing the time-series data.
    Returns:
        pd.DataFrame: Preprocessed DataFrame aggregated at the daily level, ready for clustering.
    """
    # Load the dataset into a DataFrame
    df = pd.DataFrame(data)
    # Ensure the 'Time' column is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])
    # Impute numerical columns with the column mean
    numerical_columns = df.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="mean")
    df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
    # Extract the date component (day-level aggregation)
    df['Date'] = df['Time'].dt.date
    # Aggregate data by day
    daily_aggregates = df.groupby('Date').agg({
        'ess active power': ['mean', 'sum', 'max', 'min'],
        'grid active power': ['mean', 'sum', 'max', 'min'],
        'Consumption active power': ['mean', 'sum', 'max', 'min'],
        'Production active power': ['mean', 'sum', 'max', 'min']
    })
    # Flatten the multi-level column index
    daily_aggregates.columns = ['_'.join(col).strip() for col in daily_aggregates.columns]
    # Reset index to make 'Date' a column
    daily_aggregates.reset_index(inplace=True)
    return daily_aggregates
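# After the flattening step above, the multi-level columns become names such as
# 'ess active power_mean', 'ess active power_sum', ..., 'Production active power_min'
# (4 features x 4 aggregations = 16 columns plus 'Date'), which is the layout the
# later clustering and visualisation code refers to (e.g. 'ess active power_sum').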
import matplotlib.dates as mdates

def process_daily_mean_power(df):
    # Convert the 'Time' column to datetime
    df['Time'] = pd.to_datetime(df['Time'])
    # Set the 'Time' column as the index
    df.set_index('Time', inplace=True)
    # Resample the data to get the mean daily value
    daily_mean = df.resample('D').mean()
    # Reset the index to make 'Time' a column again
    daily_mean = daily_mean.reset_index()
    # Rename 'Time' to 'Date'
    #daily_mean.rename(columns={'Time': 'Date'}, inplace=True)
    # Drop rows with NaN values
    daily_mean.dropna(inplace=True)
    return daily_mean
def plot_daily_mean_power(daily_mean):
    """
    Plots the daily mean power values.
    Parameters:
        daily_mean (pandas.DataFrame): DataFrame containing daily mean values.
    """
    plt.figure(figsize=(10, 6))
    for column in ['ess active power', 'grid active power', 'Consumption active power', 'Production active power']:
        plt.plot(daily_mean['Time'], daily_mean[column], label=column)
    plt.xlabel('Date')
    plt.ylabel('Mean Daily Value')
    plt.title('Mean Daily Power Values')
    plt.legend()
    plt.grid(True)
    # Format x-axis to display dates
    ax = plt.gca()
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_major_locator(mdates.DayLocator())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Placeholder SBD distance function (the aeon import below replaces it)
def sbd_distance(x, y):
    # Placeholder only: a Euclidean norm, not a true shape-based distance
    return np.linalg.norm(x - y)
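# If the aeon import below were unavailable, a minimal sketch of the real shape-based
# distance from the k-Shape paper is SBD = 1 - max normalized cross-correlation.
# This helper is illustrative, not part of the original script:
def sbd_ncc(x, y):
    # cross-correlate at every shift, then normalise by the product of the norms
    cc = np.correlate(x, y, mode="full")
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0:
        return 1.0
    return 1.0 - cc.max() / denom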
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from tslearn.clustering import KShape, TimeSeriesKMeans
from tslearn.metrics import dtw
from sklearn.cluster import AgglomerativeClustering
from aeon.distances import sbd_distance, sbd_pairwise_distance
# Compute additional indices (Dunn, COP, Score Function)
def compute_custom_indices(data, labels):
    """
    Compute Dunn Index, COP Index, and Score Function for clustering validation.
    Args:
        data (np.ndarray): Feature data used for clustering.
        labels (np.ndarray): Cluster labels.
    Returns:
        tuple: (Dunn Index, COP Index, Score Function)
    """
    # Unique clusters
    unique_labels = np.unique(labels)
    clusters = [data[labels == k] for k in unique_labels]

    # Pairwise distances within one cluster (shadows sklearn's pairwise_distances locally)
    def pairwise_distances(cluster):
        return np.linalg.norm(cluster[:, None, :] - cluster[None, :, :], axis=-1).flatten()

    # Dunn Index: smallest inter-cluster distance over largest intra-cluster diameter
    inter_cluster_distances = []
    intra_cluster_distances = []
    for i, cluster_i in enumerate(clusters):
        # Intra-cluster distance
        if len(cluster_i) > 1:
            intra_cluster_distances.append(pairwise_distances(cluster_i).max())
        else:
            intra_cluster_distances.append(0)
        # Inter-cluster distances
        for j, cluster_j in enumerate(clusters):
            if i != j:
                dist = np.linalg.norm(
                    cluster_i[:, None, :] - cluster_j[None, :, :], axis=-1
                ).min()
                inter_cluster_distances.append(dist)
    dunn_index = min(inter_cluster_distances) / max(intra_cluster_distances)

    # COP Index: total mean intra-cluster distance over total mean inter-cluster distance
    inter_dists = []
    intra_dists = []
    for cluster in clusters:
        if len(cluster) > 1:
            intra_dists.append(pairwise_distances(cluster).mean())
        else:
            intra_dists.append(0)
    for i, cluster_i in enumerate(clusters):
        for j, cluster_j in enumerate(clusters):
            if i != j:
                inter_dists.append(
                    np.linalg.norm(
                        cluster_i[:, None, :] - cluster_j[None, :, :], axis=-1
                    ).mean()
                )
    cop_index = sum(intra_dists) / sum(inter_dists)

    # Score Function (SF) Index
    score_function = (len(unique_labels) / len(data)) * (np.mean(intra_cluster_distances) / np.mean(inter_cluster_distances))
    return dunn_index, cop_index, score_function
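# A minimal smoke test for compute_custom_indices (synthetic data, illustrative only):
# X = np.vstack([np.random.randn(20, 4), np.random.randn(20, 4) + 5.0])
# y = np.array([0] * 20 + [1] * 20)
# dunn, cop, sf = compute_custom_indices(X, y)  # well-separated blobs => high Dunn, low COP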
def evaluate_clustering(labels, data, output_file="clustering_cvi.csv"):
    """
    Evaluate clustering performance with various validity indices and save them to a CSV file.
    Args:
        labels (np.ndarray): Cluster labels.
        data (np.ndarray): Data used for clustering.
        output_file (str): Path to save the CSV file with cluster validity indices.
    Returns:
        dict: Dictionary of validity indices.
    """
    # Standard indices
    sil = silhouette_score(data, labels, metric="euclidean")
    db = davies_bouldin_score(data, labels)
    ch = calinski_harabasz_score(data, labels)
    # Custom indices
    dunn, cop, sf = compute_custom_indices(data, labels)
    # Store indices in a dictionary
    cvi_results = {
        "Silhouette Score": sil,
        "Davies-Bouldin Score": db,
        "Calinski-Harabasz Score": ch,
        "Dunn Index": dunn,
        "COP Index": cop,
        "Score Function": sf,
    }
    # Save results to a CSV file
    pd.DataFrame([cvi_results]).to_csv(output_file, index=False)
    print(f"Cluster validity indices saved to {output_file}")
    return cvi_results
# Clustering function
def cluster_daily_aggregates(data, clustering_method="K-shape", distance_metric="Euclidean", n_clusters=3):
    """
    Cluster daily aggregates using various methods and evaluate clustering performance.
    Args:
        data (pd.DataFrame): Preprocessed daily aggregate DataFrame.
        clustering_method (str): The clustering method to use.
        distance_metric (str): The distance metric to use.
        n_clusters (int): The number of clusters.
    Returns:
        dict: Dictionary with clustering labels, method details, and evaluation metrics.
    """
    # Prepare data for clustering (drop the time column)
    features = data.drop(columns=["Time"]).values
    # z-normalize each day (row) for clustering
    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
    # Initialize model based on method and distance
    if clustering_method == "PAM":
        if distance_metric not in ["Euclidean", "DTW"]:
            raise ValueError("PAM supports only Euclidean and DTW.")
        model = KMedoids(n_clusters=n_clusters, metric=distance_metric.lower())
    elif clustering_method == "K-shape":
        # K-shape uses SBD internally, so no metric argument is needed
        model = KShape(n_clusters=n_clusters)
    elif clustering_method == "DBA":
        if distance_metric != "DTW":
            raise ValueError("DBA requires DTW as the distance metric.")
        # tslearn uses DBA barycenters whenever metric="dtw"
        model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw")
    elif clustering_method in ["Single", "Average", "Complete", "Ward", "Centroid", "Median"]:
        # NB: sklearn implements only 'ward', 'complete', 'average', 'single';
        # 'Centroid' and 'Median' produce the "Unknown linkage type" errors logged above
        linkage = clustering_method.lower()
        affinity = distance_metric.lower()
        if affinity not in ["euclidean", "dtw", "sbd"]:
            raise ValueError("Hierarchical supports only Euclidean, DTW, and SBD.")
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage, affinity="precomputed")
        # Compute pairwise distance matrix; the original passed np.linalg.norm two
        # positional arguments (treating y as the norm order) and the full-matrix
        # sbd_pairwise_distance where a per-pair distance is needed
        distance_func = {
            "euclidean": lambda x, y: np.linalg.norm(x - y),
            "dtw": dtw,
            "sbd": sbd_distance,
        }[affinity]
        distance_matrix = np.array([[distance_func(x, y) for y in features] for x in features])
        labels = model.fit_predict(distance_matrix)
    else:
        raise ValueError(f"Invalid clustering method: {clustering_method}")
    # Fit the model and get labels
    if clustering_method not in ["Single", "Average", "Complete", "Ward", "Centroid", "Median"]:
        labels = model.fit_predict(features)
    # Evaluate clustering with indices
    indices = evaluate_clustering(labels, features)
    return {
        "labels": labels,
        **indices,
        "method": clustering_method,
        "metric": distance_metric,
    }
def visualize_clusters(daily_aggregates, labels):
    """
    Visualize clustering results on the daily aggregates for all features.
    Args:
        daily_aggregates (pd.DataFrame): Preprocessed daily aggregates data.
        labels (array-like): Cluster labels for each day.
    """
    # Add cluster labels to the daily aggregates DataFrame
    daily_aggregates['Cluster'] = labels
    # Features to visualize
    features_to_plot = [
        'ess active power',
        'grid active power',
        'Consumption active power',
        'Production active power'
    ]
    plt.figure(figsize=(16, 8))
    for idx, feature in enumerate(features_to_plot, 1):
        plt.subplot(2, 2, idx)
        sns.scatterplot(
            x=daily_aggregates['Time'],
            y=daily_aggregates[feature],
            hue=daily_aggregates['Cluster'],
            palette="viridis",
            s=100,
            edgecolor="k",
        )
        plt.title(f'Cluster Visualization for {feature}', fontsize=14)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel(feature, fontsize=12)
        plt.xticks(rotation=45)
        plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
def visualize_clusters_over_time(daily_aggregates, labels):
    """
    Visualize clustering results over time for each feature.
    Args:
        daily_aggregates (pd.DataFrame): Preprocessed daily aggregates data.
        labels (array-like): Cluster labels for each day.
    """
    # Add cluster labels to the daily aggregates DataFrame
    daily_aggregates['Cluster'] = labels
    # Features to visualize
    features_to_plot = [
        'ess active power',
        'grid active power',
        'Consumption active power',
        'Production active power'
    ]
    plt.figure(figsize=(16, 12))
    for idx, feature in enumerate(features_to_plot, 1):
        plt.subplot(2, 2, idx)
        for cluster in sorted(daily_aggregates['Cluster'].unique()):
            cluster_data = daily_aggregates[daily_aggregates['Cluster'] == cluster]
            plt.plot(cluster_data['Time'], cluster_data[feature], label=f'Cluster {cluster}', linewidth=2)
        plt.title(f'Cluster Visualization for {feature}', fontsize=14)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel(feature, fontsize=12)
        plt.xticks(rotation=45)
        plt.grid(True, linestyle="--", alpha=0.7)
        plt.legend()
    plt.tight_layout()
    plt.show()
from matplotlib.colors import ListedColormap

# Visualize clustering results using a heatmap
def visualize_clusters_heatmap(daily_aggregates, labels):
    """
    Visualize clustering results using a heatmap with months on the x-axis, days on the y-axis,
    and a distinct color for each cluster.
    Args:
        daily_aggregates (pd.DataFrame): Preprocessed daily aggregates data.
        labels (array-like): Cluster labels for each day.
    """
    # Increment cluster labels by 1 to start from 1 instead of 0
    labels = labels + 1
    # Add cluster labels to the daily aggregates DataFrame
    daily_aggregates['Cluster'] = labels
    # Extract month and day from the 'Time' column
    daily_aggregates['Month'] = daily_aggregates['Time'].dt.strftime('%b')  # Abbreviated month name (e.g., Jan, Feb)
    daily_aggregates['Day'] = daily_aggregates['Time'].dt.day  # Day of the month (1 to 31)
    # Pivot the data for heatmap: Months on x-axis, Days on y-axis, and Clusters as values
    heatmap_data = daily_aggregates.pivot_table(index='Day', columns='Month', values='Cluster', aggfunc='first')
    # Ensure all days (1 to 31) are included and months appear in calendar order
    heatmap_data = heatmap_data.reindex(index=range(1, 32), columns=pd.date_range(start='2023-01-01', periods=12, freq='M').strftime('%b'))
    # Create a categorical color palette for clusters
    unique_clusters = np.unique(labels)
    cluster_palette = sns.color_palette("husl", n_colors=len(unique_clusters))  # A distinct color for each cluster
    cluster_cmap = ListedColormap(cluster_palette)
    # Plot the heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        heatmap_data,
        cmap=cluster_cmap,
        annot=False,
        linewidths=0.5,
        cbar_kws={'ticks': unique_clusters, 'label': 'Cluster'}
    )
    plt.title('Cluster Distribution by Month and Day', fontsize=16)
    plt.xlabel('Month', fontsize=14)
    plt.ylabel('Day of Month', fontsize=14)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
# Visualize clustering results by hour of the day
def visualize_clusters_by_hour(daily_aggregates, labels):
    """
    Visualize clustering results by hour of the day with normalized data.
    Args:
        daily_aggregates (pd.DataFrame): Preprocessed daily aggregates data.
        labels (array-like): Cluster labels for each day.
    """
    # Add cluster labels to the daily aggregates DataFrame
    daily_aggregates['Cluster'] = labels
    # Extract the hour from the 'Time' column
    daily_aggregates['Hour'] = daily_aggregates['Time'].dt.hour
    # Features to visualize
    features_to_plot = [
        'ess active power',
        'grid active power',
        'Consumption active power',
        'Production active power'
    ]
    # Normalize the features
    scaler = MinMaxScaler()
    daily_aggregates[features_to_plot] = scaler.fit_transform(daily_aggregates[features_to_plot])
    plt.figure(figsize=(16, 12))
    for idx, feature in enumerate(features_to_plot, 1):
        plt.subplot(2, 2, idx)
        sns.boxplot(
            x='Hour',
            y=feature,
            hue='Cluster',
            data=daily_aggregates,
            palette="viridis"
        )
        plt.title(f'Cluster Visualization for {feature} by Hour', fontsize=14)
        plt.xlabel('Hour of the Day', fontsize=12)
        plt.ylabel(feature, fontsize=12)
        plt.grid(True, linestyle="--", alpha=0.7)
        plt.legend(title='Cluster')
        # Set x-axis ticks to include all 24 hours
        plt.xticks(ticks=range(0, 24), labels=[f"{i}:00" for i in range(24)], rotation=45)
    plt.tight_layout()
    plt.show()
# Visualize features using heatmaps
def visualize_features_heatmap(daily_aggregates):
    """
    Visualize features using heatmaps with months on the x-axis, days on the y-axis
    (ascending from bottom to top), and normalized feature values as colors.
    Args:
        daily_aggregates (pd.DataFrame): Preprocessed daily aggregates data.
    """
    # Extract month and day from the 'Time' column
    daily_aggregates['Month'] = daily_aggregates['Time'].dt.strftime('%b')  # Abbreviated month name (e.g., Jan, Feb)
    daily_aggregates['Day'] = daily_aggregates['Time'].dt.day  # Day of the month (1 to 31)
    # Features to visualize
    features_to_plot = [
        'ess active power',
        'grid active power',
        'Consumption active power',
        'Production active power'
    ]
    # Initialize MinMaxScaler for normalization
    scaler = MinMaxScaler()
    # Plot heatmaps for each feature
    plt.figure(figsize=(18, 12))
    for idx, feature in enumerate(features_to_plot, 1):
        plt.subplot(2, 2, idx)
        # Pivot the data for heatmap: Months on x-axis, Days on y-axis, and feature values as colors
        heatmap_data = daily_aggregates.pivot_table(index='Day', columns='Month', values=feature, aggfunc='mean')
        # Ensure all days (1 to 31) are included and months appear in calendar order
        heatmap_data = heatmap_data.reindex(index=range(1, 32), columns=pd.date_range(start='2023-01-01', periods=12, freq='M').strftime('%b'))
        # Normalize the feature values using MinMaxScaler
        heatmap_data_normalized = pd.DataFrame(scaler.fit_transform(heatmap_data), columns=heatmap_data.columns, index=heatmap_data.index)
        # Plot the heatmap
        sns.heatmap(
            heatmap_data_normalized,
            cmap="viridis",  # Gradient colormap for feature values
            annot=False,
            linewidths=0.5,
            cbar_kws={'label': f'Normalized {feature}'}
        )
        plt.title(f'{feature} by Month and Day', fontsize=14)
        plt.xlabel('Month', fontsize=12)
        plt.ylabel('Day of Month', fontsize=12)
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        # Reverse the y-axis to show days in ascending order from bottom to top
        plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
# Example Usage
if __name__ == "__main__":
    # Step 1: Load the dataset
    data = pd.read_csv("/home/jvaldes/Desktop/krishna-thesis/thesiscodeimplementation/data/EFH (1).csv")  # Replace with your dataset file
    # Process the data to get daily mean values
    daily_mean_df = process_daily_mean_power(data)
    # Cluster the data
    results = cluster_daily_aggregates(
        data=daily_mean_df,
        clustering_method="PAM",      # One of "PAM", "K-shape", "DBA", or a linkage name
        distance_metric="Euclidean",  # "Euclidean" or "DTW" for PAM
        n_clusters=5
    )
    # Visualize the clusters
    visualize_clusters(daily_mean_df, results['labels'])
    # Visualize the clusters as a calendar heatmap
    visualize_clusters_heatmap(daily_mean_df, results['labels'])
    # Visualize the clusters by hour of the day
    visualize_clusters_by_hour(daily_mean_df, results['labels'])
    # Visualize the features using heatmaps
    visualize_features_heatmap(daily_mean_df)
Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Dunn Index,COP Index,Score Function
0.2669572418330512,1.0653985796636556,120.65075634844322,0.005282282074285898,0.12360795558737725,0.1782706914446735
File,Method,Metric,Silhouette,Davies-Bouldin,Calinski-Harabasz
Algorithm,n_clusters,metric,Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Dunn Index,COP Index,Score Function,distance_metric,linkage
PAM,3,euclidean,0.6084735745604813,0.5255388718350726,1573.8083266201857,0.024080739069240218,0.12466666158798587,0.020374527799168107,,
PAM,3,dtw,0.607357163367625,0.5279114718277387,1566.5655837795136,0.0256611410903551,0.12394902123867019,0.018476344437076984,,
PAM,5,euclidean,0.5130716869414388,0.6000071474870469,1671.6513431894605,0.02382786144635281,0.04898240277034309,0.014865213685815603,,
PAM,5,dtw,0.45097656449087653,0.7511981091089753,1199.6940913855497,0.008158928977945934,0.051837188725915216,0.016777185398622014,,
PAM,10,euclidean,0.40541899139937965,0.832945215289539,1346.9768732861298,0.012241836667342998,0.017688312074442908,0.0176282602403863,,
PAM,10,dtw,0.4142238999219652,0.7443833865684608,1328.9120284602031,0.012241836667342998,0.015978489422017256,0.015251591706191907,,
PAM,15,euclidean,0.39728780680452075,1.0267532426318489,879.3026125761872,0.012241836667342998,0.01081747720071677,0.02418145130736666,,
PAM,15,dtw,0.44316432015694907,0.7124307990760144,1609.4539126425957,0.03419163142941111,0.007599369876697907,0.014427271694567274,,
PAM,25,euclidean,0.39883340664138783,0.7061861709147516,743.9186095728879,0.024274880313737236,0.006667104504980991,0.02439578597502907,,
PAM,25,dtw,0.41030169965606983,0.8153478685847727,1192.925391489275,0.034945030212473846,0.0039025469711751284,0.019213748605672862,,
KShape,3,,0.6160397975833883,0.5086098610813464,1422.9985136673556,0.03685808013313155,0.12343003077286399,0.01906651031821008,,
KShape,5,,0.4767574584701706,0.572965049258707,1256.2560131989646,0.012847192222398445,0.040502225432802685,0.010338159516200476,,
KShape,10,,0.4642589675222783,0.7805246344695924,1019.7914746414634,0.033470661636082304,0.017669291409334032,0.017746937758402505,,
KShape,15,,0.39627824824565594,0.7539017550565398,2234.038751219863,0.02131106543896832,0.006569568014584026,0.012443260594478018,,
KShape,25,,0.3111002115907659,0.8989652432496515,2205.5219477546093,0.023962476515012668,0.0025699639977045255,0.011569965944323208,,
DBA,3,,0.6134224523646222,0.5174393596186243,1589.636201077178,0.04885213490256127,0.12505532967955998,0.01871504588601016,,
DBA,5,,0.5262338608445839,0.5882668471260979,2032.8201046534882,0.03630017951329181,0.04629526874704735,0.012398146565969744,,
DBA,10,,0.4594687018040139,0.6455833863810562,2760.8281676750903,0.02562350557364855,0.012119582182651974,0.010178741431244613,,
DBA,15,,0.41779240351557945,0.7451002406151804,3155.2344101063713,0.05475809617253117,0.006249220523684118,0.010834723447843028,,
DBA,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,,
Single,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,single
Single,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,single
Single,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,single
Single,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,single
Single,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,single
Single,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,single
Single,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,single
Single,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,single
Single,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,single
Single,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,single
Single,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,single
Single,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,single
Single,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,single
Single,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,single
Single,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,single
Average,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,average
Average,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,average
Average,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,average
Average,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,average
Average,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,average
Average,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,average
Average,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,average
Average,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,average
Average,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,average
Average,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,average
Average,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,average
Average,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,average
Average,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,average
Average,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,average
Average,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,average
Complete,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,complete
Complete,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,complete
Complete,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,complete
Complete,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,complete
Complete,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,complete
Complete,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,complete
Complete,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,complete
Complete,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,complete
Complete,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,complete
Complete,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,complete
Complete,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,complete
Complete,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,complete
Complete,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,complete
Complete,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,complete
Complete,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,complete
Centroid,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,centroid
Centroid,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,centroid
Centroid,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,centroid
Centroid,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,centroid
Centroid,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,centroid
Centroid,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,centroid
Centroid,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,centroid
Centroid,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,centroid
Centroid,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,centroid
Centroid,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,centroid
Centroid,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,centroid
Centroid,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,centroid
Centroid,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,centroid
Centroid,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,centroid
Centroid,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,centroid
Median,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,median
Median,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,median
Median,3,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,median
Median,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,median
Median,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,median
Median,5,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,median
Median,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,median
Median,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,median
Median,10,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,median
Median,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,median
Median,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,median
Median,15,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,median
Median,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,euclidean,median
Median,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,dtw,median
Median,25,,0.40088059138811455,0.7555800552965729,2998.728365916879,0.09754460177447387,0.002623820677912112,0.011268714247072832,sbd,median
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.dates as mdates
from sklearn.impute import SimpleImputer

## 1. Exploratory Data Analysis
def exploratory_data_analysis(data):
    """
    Perform exploratory data analysis on a dataset with 15-minute time intervals.
    Args:
        data (pd.DataFrame): Input dataset with a datetime column named 'Time'.
    Returns:
        pd.DataFrame: Processed DataFrame with additional features.
    """
    # Ensure 'Time' column is in datetime format
    data['Time'] = pd.to_datetime(data['Time'])
    # Set 'Time' column as the index
    data.set_index('Time', inplace=True)
    # Extract temporal features
    data['Hour'] = data.index.hour
    data['Minute'] = data.index.minute
    data['Day'] = data.index.day
    data['Weekday'] = data.index.weekday  # 0=Monday, 6=Sunday
    data['Month'] = data.index.month
    data['DayOfYear'] = data.index.dayofyear
    # Impute missing values with the column mean
    imputer = SimpleImputer(strategy="mean")
    data.iloc[:, :] = imputer.fit_transform(data)
    return data
import numpy as np

def plot_daily_mean_power(data):
    # Load the dataset into a DataFrame
    df = pd.DataFrame(data)
    # Ensure the 'Time' column is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])
    # Set 'Time' as the index
    df.set_index('Time', inplace=True)
    # Impute numerical columns with the column mean
    numerical_columns = df.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="mean")
    df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
    # Resample the data to get the mean daily value
    daily_mean = df.resample('D').mean()
    # Reset the index to make 'Time' a column again
    daily_mean = daily_mean.reset_index()
    # Reshape the data to long format
    daily_mean_long = daily_mean.melt(id_vars='Time', var_name='Power_Type', value_name='Mean_Value')
    # Set seaborn style
    sns.set(style="ticks")
    # Plot the data
    plt.figure(figsize=(14, 7))
    sns.lineplot(data=daily_mean_long, x='Time', y='Mean_Value', hue='Power_Type', linewidth=2.5)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Mean Daily Value', fontsize=14)
    plt.title('Mean Daily Power Values', fontsize=16)
    plt.legend(title='Power Type', fontsize=12, title_fontsize=12)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, fontsize=12)
    plt.yticks(fontsize=12)
    # Improve date formatting on x-axis
    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(10))
    plt.gcf().autofmt_xdate()
    plt.tight_layout()  # Adjust layout to prevent label overlap
    plt.show()
# Main Program to run the clustering
if __name__ == "__main__":
    # Load the dataset
    data = pd.read_csv("/home/jvaldes/Desktop/krishna-thesis/thesiscodeimplementation/data/Autohaus 2.csv")  # Replace with your dataset file
    #df = exploratory_data_analysis(data)
    plot_daily_mean_power(data)
##### Imports #####
import os
import math
import itertools
import streamlit as st
import pandas as pd
import textwrap
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
import numba
import numpy as np
import seaborn as sns
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import colors
import matplotlib.pyplot as plt
import altair as alt
##### Plotly ######
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import chart_studio
from plotly import tools
from plotly.subplots import make_subplots
import time
import datetime
from datetime import timedelta
from scipy.fftpack import rfft
from scipy.stats import boxcox
from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, pairwise_distances
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.cluster import contingency_matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn_extra.cluster import KMedoids
# Algorithms
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import KShape, TimeSeriesKMeans
from tslearn.metrics import dtw
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sktime.distances import dtw_distance
from dtaidistance import clustering
from aeon.distances import sbd_distance, sbd_pairwise_distance
#from fcmeans import FCM
# Implementation for pyclustering kmeans
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import random_center_initializer, kmeans_plusplus_initializer
from pyclustering.cluster.encoder import type_encoding, cluster_encoder
from pyclustering.utils.metric import distance_metric
from pyclustering.cluster.fcm import fcm
from validclust import cop, dunn
# Preprocessing / plotting helpers
from netdata_pandas.data import get_data, get_chart_list
from am4894plots.plots import plot_lines, plot_lines_grid
from matplotlib.patches import Ellipse
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
####### DEF for null values #####
def null_values(df):
    """Summarise columns that contain nulls: count, share, and dtype."""
    null_test = (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False).index
    null_data_test = pd.concat([
        df.isnull().sum(axis=0),
        (df.isnull().sum(axis=0) / len(df)).sort_values(ascending=False),
        df.loc[:, df.columns.isin(list(null_test))].dtypes], axis=1)
    null_data_test = null_data_test.rename(columns={0: '# null',
                                                    1: '% null',
                                                    2: 'type'}).sort_values(ascending=False, by='% null')
    null_data_test = null_data_test[null_data_test["# null"] != 0]
    return null_data_test

def type(df):
    # NB: shadows the built-in `type`; kept for compatibility with existing callers
    return pd.DataFrame(df.dtypes, columns=['Type'])

def preprocessing_meanvar(df):
    # z-normalise each row as one time series (the original referenced X before assignment)
    X = TimeSeriesScalerMeanVariance().fit_transform(df)
    df = pd.DataFrame(X.reshape(df.shape), columns=df.columns, index=df.index)
    return df

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    confusion_matrix = contingency_matrix(y_true, y_pred)
    # purity: fraction of samples assigned to the majority true class of their cluster
    return np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
## 1. Exploratory Data Analysis
def exploratory_data_analysis(data):
    """
    Perform exploratory data analysis on a dataset with 15-minute time intervals.
    Args:
        data (pd.DataFrame): Input dataset with a datetime column named 'Time'.
    Returns:
        pd.DataFrame: Processed DataFrame with additional features.
    """
    # Ensure 'Time' column is in datetime format
    data['Time'] = pd.to_datetime(data['Time'])
    # Set 'Time' column as the index
    data.set_index('Time', inplace=True)
    # Extract temporal features
    data['Hour'] = data.index.hour
    data['Minute'] = data.index.minute
    data['Day'] = data.index.day
    data['Weekday'] = data.index.weekday  # 0=Monday, 6=Sunday
    data['Month'] = data.index.month
    data['DayOfYear'] = data.index.dayofyear
    # Impute missing values with the column mean
    imputer = SimpleImputer(strategy="mean")
    data.iloc[:, :] = imputer.fit_transform(data)
    # Display summary statistics
    print(data.describe())
    return data
## Preprocess for clustering
def preprocess_for_clustering(data):
    """
    Preprocess the data for clustering by aggregating it to a daily level.
    Args:
        data (pd.DataFrame): Input DataFrame containing the time-series data.
    Returns:
        pd.DataFrame: Preprocessed DataFrame aggregated at the daily level, ready for clustering.
    """
    # Load the dataset into a DataFrame
    df = pd.DataFrame(data)
    # Ensure the 'Time' column is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])
    # Impute numerical columns with the column mean
    numerical_columns = df.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="mean")
    df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
    # Extract the date component (day-level aggregation)
    df['Date'] = df['Time'].dt.date
    # Aggregate data by day
    daily_aggregates = df.groupby('Date').agg({
        'ess active power': ['mean', 'sum', 'max', 'min'],
        'grid active power': ['mean', 'sum', 'max', 'min'],
        'Consumption active power': ['mean', 'sum', 'max', 'min'],
        'Production active power': ['mean', 'sum', 'max', 'min']
    })
    # Flatten the multi-level column index
    daily_aggregates.columns = ['_'.join(col).strip() for col in daily_aggregates.columns]
    # Reset index to make 'Date' a column
    daily_aggregates.reset_index(inplace=True)
    return daily_aggregates
def plot_daily_mean_power(data):
    # Load the dataset into a DataFrame
    df = pd.DataFrame(data)
    # Ensure the 'Time' column is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])
    # Set 'Time' as the index
    df.set_index('Time', inplace=True)
    # Impute numerical columns with the column mean
    numerical_columns = df.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="mean")
    df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
    # Resample the data to get the mean daily value
    daily_mean = df.resample('D').mean()
    # Reset the index to make 'Time' a column again
    daily_mean = daily_mean.reset_index()
    # Reshape the data to long format
    daily_mean_long = daily_mean.melt(id_vars='Time', var_name='Power_Type', value_name='Mean_Value')
    # Set seaborn style
    sns.set(style="ticks")
    # Plot the data
    plt.figure(figsize=(14, 7))
    sns.lineplot(data=daily_mean_long, x='Time', y='Mean_Value', hue='Power_Type', linewidth=2.5)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Mean Daily Value', fontsize=14)
    plt.title('Mean Daily Power Values', fontsize=16)
    plt.legend(title='Power Type', fontsize=12, title_fontsize=12)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, fontsize=12)
    plt.yticks(fontsize=12)
    # Improve date formatting on x-axis
    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(10))
    plt.gcf().autofmt_xdate()
    plt.tight_layout()  # Adjust layout to prevent label overlap
    plt.show()
# Compute additional indices (Dunn, COP, Score Function)
def compute_custom_indices(data, labels):
    """
    Compute Dunn Index, COP Index, and Score Function for clustering validation.
    Args:
        data (np.ndarray): Feature data used for clustering.
        labels (np.ndarray): Cluster labels.
    Returns:
        tuple: (Dunn Index, COP Index, Score Function)
    """
    # Unique clusters
    unique_labels = np.unique(labels)
    clusters = [data[labels == k] for k in unique_labels]

    # Pairwise distances within one cluster (shadows sklearn's pairwise_distances locally)
    def pairwise_distances(cluster):
        return np.linalg.norm(cluster[:, None, :] - cluster[None, :, :], axis=-1).flatten()

    # Dunn Index: smallest inter-cluster distance over largest intra-cluster diameter
    inter_cluster_distances = []
    intra_cluster_distances = []
    for i, cluster_i in enumerate(clusters):
        # Intra-cluster distance
        if len(cluster_i) > 1:
            intra_cluster_distances.append(pairwise_distances(cluster_i).max())
        else:
            intra_cluster_distances.append(0)
        # Inter-cluster distances
        for j, cluster_j in enumerate(clusters):
            if i != j:
                dist = np.linalg.norm(
                    cluster_i[:, None, :] - cluster_j[None, :, :], axis=-1
                ).min()
                inter_cluster_distances.append(dist)
    dunn_index = min(inter_cluster_distances) / max(intra_cluster_distances)

    # COP Index: total mean intra-cluster distance over total mean inter-cluster distance
    inter_dists = []
    intra_dists = []
    for cluster in clusters:
        if len(cluster) > 1:
            intra_dists.append(pairwise_distances(cluster).mean())
        else:
            intra_dists.append(0)
    for i, cluster_i in enumerate(clusters):
        for j, cluster_j in enumerate(clusters):
            if i != j:
                inter_dists.append(
                    np.linalg.norm(
                        cluster_i[:, None, :] - cluster_j[None, :, :], axis=-1
                    ).mean()
                )
    cop_index = sum(intra_dists) / sum(inter_dists)

    # Score Function (SF) Index
    score_function = (len(unique_labels) / len(data)) * (np.mean(intra_cluster_distances) / np.mean(inter_cluster_distances))
    return dunn_index, cop_index, score_function
# Evaluate clustering performance
def evaluate_clustering(labels, data):
    """
    Evaluate clustering performance with various validity indices.
    Args:
        labels (np.ndarray): Cluster labels.
        data (np.ndarray): Data used for clustering.
    Returns:
        dict: Dictionary of validity indices.
    """
    sil = silhouette_score(data, labels, metric="euclidean")
    db = davies_bouldin_score(data, labels)
    ch = calinski_harabasz_score(data, labels)
    dunn, cop, sf = compute_custom_indices(data, labels)
    return {
        "Silhouette Score": sil,
        "Davies-Bouldin Score": db,
        "Calinski-Harabasz Score": ch,
        "Dunn Index": dunn,
        "COP Index": cop,
        "Score Function": sf,
    }
# Compute pairwise DTW distance matrix
def compute_dtw_distance_matrix(data):
    n_samples = data.shape[0]
    distance_matrix = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            distance = dtw(data[i], data[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    return distance_matrix

# Compute pairwise SBD distance matrix
def compute_sbd_distance_matrix(data):
    n_samples = data.shape[0]
    distance_matrix = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            distance = sbd_distance(data[i], data[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    return distance_matrix
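# Both loops above make O(n^2) single-pair calls; since sbd_pairwise_distance is
# already imported from aeon, the SBD matrix can also be built in one vectorised call.
# A minimal sketch (illustrative, assuming `features` has shape (n_days, n_features)):
# sbd_matrix = sbd_pairwise_distance(features)  # returns the full (n, n) matrix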
# Run clustering with multiple algorithms
def run_clustering(data, algorithms, hyperparams, output_file="clustering_results.csv"):
    """
    Run clustering with multiple algorithms and hyperparameters, evaluate CVI scores, and save results.
    Args:
        data (pd.DataFrame): Preprocessed daily aggregate DataFrame.
        algorithms (dict): Dictionary of algorithms with their corresponding clustering functions.
        hyperparams (dict): Dictionary of hyperparameters for each algorithm.
        output_file (str): Path to save the CSV file with clustering results.
    Returns:
        pd.DataFrame: DataFrame containing CVI scores for all algorithm-hyperparameter combinations.
    """
    results = []
    features = data.drop(columns=["Date"]).values
    # z-normalize each day (row)
    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
    print(features.shape)
    for algo_name, algo_func in algorithms.items():
        algo_params = hyperparams.get(algo_name, {})
        param_grid = [dict(zip(algo_params.keys(), v)) for v in itertools.product(*algo_params.values())]
        for params in param_grid:
            if algo_name == "PAM" and params.get("metric") == "dtw":
                distance_matrix = compute_dtw_distance_matrix(features)
                model = algo_func(metric="precomputed", n_clusters=params["n_clusters"])
                labels = model.fit_predict(distance_matrix)
            elif algo_name in ["Single", "Average", "Complete", "Centroid", "Median"]:
                distance_metric = params.get("distance_metric", "euclidean")  # Default to 'euclidean' if not specified
                if distance_metric == "dtw":
                    distance_matrix = compute_dtw_distance_matrix(features)
                elif distance_metric == "sbd":
                    distance_matrix = compute_sbd_distance_matrix(features)
                elif distance_metric == "euclidean":
                    distance_matrix = features  # Use the features directly for Euclidean distance
                else:
                    raise ValueError(f"Unsupported distance metric: {distance_metric}")
                # NB: the fit below is commented out, so `labels` from the previous
                # iteration is silently reused for every hierarchical configuration --
                # which is why all Single/Average/Complete/Centroid/Median rows in
                # clustering_results.csv carry identical scores.
                #model = algo_func(n_clusters=params["n_clusters"], linkage=params["linkage"], affinity="precomputed")
                #labels = model.fit_predict(distance_matrix)
            else:
                model = algo_func(**params)
                labels = model.fit_predict(features)
            scores = evaluate_clustering(labels, features)
            result = {"Algorithm": algo_name, **params, **scores}
            results.append(result)
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)
    print(f"Clustering results saved to {output_file}")
    return results_df
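# A minimal sketch of how the hierarchical branch could be completed (an assumption,
# not the author's final code): scikit-learn accepts only 'single', 'average', and
# 'complete' linkage with a precomputed matrix, and 'ward' only on raw euclidean
# features, so 'centroid'/'median' would still need scipy.cluster.hierarchy.
#
#     if params["linkage"] in ("single", "average", "complete"):
#         if distance_metric == "euclidean":
#             model = AgglomerativeClustering(n_clusters=params["n_clusters"],
#                                             linkage=params["linkage"])
#             labels = model.fit_predict(features)
#         else:
#             model = AgglomerativeClustering(n_clusters=params["n_clusters"],
#                                             linkage=params["linkage"],
#                                             affinity="precomputed")  # renamed 'metric=' in sklearn >= 1.2
#             labels = model.fit_predict(distance_matrix)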
# Visualize clustering results
def visualize_clusters(daily_aggregates, labels):
    """
    Visualize clustering results on the daily aggregates for all features.
    Args:
        daily_aggregates (pd.DataFrame): Preprocessed daily aggregates data.
        labels (array-like): Cluster labels for each day.
    """
    # Add cluster labels to the daily aggregates DataFrame
    daily_aggregates['Cluster'] = labels
    # Features to visualize
    features_to_plot = [
        'ess active power_sum',
        'grid active power_sum',
        'Consumption active power_sum',
        'Production active power_sum'
    ]
    plt.figure(figsize=(16, 8))
    for idx, feature in enumerate(features_to_plot, 1):
        plt.subplot(2, 2, idx)
        sns.scatterplot(
            x=daily_aggregates['Date'],
            y=daily_aggregates[feature],
            hue=daily_aggregates['Cluster'],
            palette="viridis",
            s=100,
            edgecolor="k",
        )
        plt.title(f'Cluster Visualization for {feature}', fontsize=14)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel(feature, fontsize=12)
        plt.xticks(rotation=45)
        plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
# Main Program to run the clustering
if __name__ == "__main__":
# Load the dataset
data = pd.read_csv("/home/jvaldes/Desktop/krishna-thesis/thesiscodeimplementation/data/Autohaus 2.csv") # Replace with your dataset file
# Perform EDA
#processed_data = exploratory_data_analysis(data)
plot_daily_mean_power(data)
process_data = preprocess_for_clustering(data)
#df = daily_aggregates
# Define algorithms
algorithms = {
"PAM": lambda **kwargs: KMedoids(metric=kwargs.get("metric", "euclidean"), n_clusters=kwargs.get("n_clusters", 3)),
"KShape": lambda **kwargs: KShape(n_clusters=kwargs.get("n_clusters", 3)),
"DBA": lambda **kwargs: TimeSeriesKMeans(n_clusters=kwargs.get("n_clusters", 3), metric="dtw"),
"Single": AgglomerativeClustering,
"Average": AgglomerativeClustering,
"Complete": AgglomerativeClustering,
#"Ward": AgglomerativeClustering,
"Centroid": AgglomerativeClustering,
"Median" : AgglomerativeClustering
}
    hyperparams = {
        "PAM": {"n_clusters": [3, 5, 10, 15, 25], "metric": ["euclidean", "dtw"]},
        "KShape": {"n_clusters": [3, 5, 10, 15, 25]},
        "DBA": {"n_clusters": [3, 5, 10, 15, 25]},
        "Single": {"n_clusters": [3, 5, 10, 15, 25], "distance_metric": ["euclidean", "dtw", "sbd"], "linkage": ["single"]},
        "Average": {"n_clusters": [3, 5, 10, 15, 25], "distance_metric": ["euclidean", "dtw", "sbd"], "linkage": ["average"]},
        "Complete": {"n_clusters": [3, 5, 10, 15, 25], "distance_metric": ["euclidean", "dtw", "sbd"], "linkage": ["complete"]},
        "Centroid": {"n_clusters": [3, 5, 10, 15, 25], "distance_metric": ["euclidean", "dtw", "sbd"], "linkage": ["centroid"]},
        "Median": {"n_clusters": [3, 5, 10, 15, 25], "distance_metric": ["euclidean", "dtw", "sbd"], "linkage": ["median"]},
        #"Ward": {"n_clusters": [2, 3], "linkage": ["ward"]},  # Ward requires Euclidean features; no distance_metric grid
    }
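    # Note: scikit-learn's AgglomerativeClustering supports only 'ward',
    # 'complete', 'average', and 'single' linkage, so the 'Centroid' and
    # 'Median' configurations above will raise. A minimal sketch of running
    # those linkages via SciPy instead (an assumption, not part of the original
    # pipeline; 'scipy_hierarchical_labels' is a hypothetical helper):
    from scipy.cluster.hierarchy import fcluster, linkage

    def scipy_hierarchical_labels(features, n_clusters, method):
        # 'centroid' and 'median' linkage are only well-defined on raw
        # Euclidean observations, not on a precomputed distance matrix
        Z = linkage(features, method=method)
        return fcluster(Z, t=n_clusters, criterion="maxclust")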
    # Run clustering over the full grid and persist the scores
    results_df = run_clustering(df, algorithms, hyperparams, output_file="clustering_results.csv")
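    # Hedged follow-up: report the best configuration by Silhouette Score as a
    # quick sanity check on the grid just written to clustering_results.csv
    best = results_df.sort_values("Silhouette Score", ascending=False).iloc[0]
    print("Best configuration by Silhouette Score:")
    print(best)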
\ No newline at end of file
Algorithm,n_clusters,metric,Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Dunn Index,COP Index,Score Function,distance_metric,linkage,normalized_Silhouette Score,normalized_Davies-Bouldin Score,normalized_Calinski-Harabasz Score,normalized_Dunn Index,normalized_COP Index,Composite Score
PAM,3,euclidean,0.6084735745604813,0.5255388718350726,1573.8083266201857,0.0240807390692402,0.1246666615879858,0.0203745277991681,,,0.9751877966310021,0.032672444262565915,0.34416467426775516,0.17812485595468683,0.9968268201722675,0.2697173441423238
PAM,3,dtw,0.607357163367625,0.5279114718277387,1566.5655837795136,0.0256611410903551,0.1239490212386701,0.0184763444370769,,,0.9715267068803151,0.037251485657567855,0.3411610266994567,0.1958055644135517,0.9909678316692678,0.2689985449811439
PAM,5,euclidean,0.5130716869414388,0.6000071474870469,1671.6513431894605,0.0238278614463528,0.048982402770343,0.0148652136858156,,,0.6623327525458085,0.17639381233086757,0.3847412825028224,0.1752957938133408,0.37892231871349097,0.20236728607503174
PAM,5,dtw,0.4509765644908765,0.7511981091089753,1199.6940913855497,0.0081589289779459,0.0518371887259152,0.016777185398622,,,0.4587018521875172,0.4681874876056567,0.18901525951591455,0.0,0.4022294782233646,-0.005265586544595392
PAM,10,euclidean,0.4054189913993796,0.832945215289539,1346.9768732861298,0.0122418366673429,0.0176883120744429,0.0176282602403863,,,0.30930316738508196,0.6259567636233143,0.2500950989413416,0.04567742862652141,0.12342983174011934,-0.05275229939456119
PAM,10,dtw,0.4142238999219652,0.7443833865684608,1328.9120284602031,0.0122418366673429,0.0159784894220172,0.0152515917061919,,,0.33817743929676025,0.455035293091231,0.24260340298765104,0.04567742862652141,0.10947042815825138,0.0070840245060159995
PAM,15,euclidean,0.3972878068045207,1.0267532426318489,879.3026125761872,0.0122418366673429,0.0108174772007167,0.0241814513073666,,,0.2826382640128592,1.0,0.05614528091813885,0.04567742862652141,0.0673346824504273,-0.20614518999490505
PAM,15,dtw,0.443164320156949,0.7124307990760144,1609.4539126425957,0.0341916314294111,0.0075993698766979,0.0144272716945672,,,0.4330828617619301,0.3933678307049878,0.3589473029116365,0.2912402137501885,0.04106127985980647,0.1087218632884482
PAM,25,euclidean,0.3988334066413878,0.7061861709147516,743.9186095728879,0.0242748803137372,0.0066671045049809,0.024395785975029,,,0.2877068084323577,0.38131589993907455,0.0,0.1802968063179059,0.03345004102709176,-0.013398050922933651
PAM,25,dtw,0.4103016996560698,0.8153478685847727,1192.925391489275,0.0349450302124738,0.0039025469711751,0.0192137486056728,,,0.32531521856169865,0.5919944525500594,0.18620820293096738,0.2996688439712497,0.01087952806486175,-0.013883198019675956
KShape,3,,0.6160397975833883,0.5086098610813464,1422.9985136673556,0.0368580801331315,0.1234300307728639,0.01906651031821,,,1.0,0.0,0.28162213507837786,0.3210710425653404,0.9867306686178529,0.28975846441042435
KShape,5,,0.4767574584701706,0.572965049258707,1256.2560131989646,0.0128471922223984,0.0405022254328026,0.0103381595162004,,,0.5432461198508105,0.12420343570689409,0.21247212974456778,0.0524498289018261,0.3096881102810576,0.14248340305416532
KShape,10,,0.4642589675222783,0.7805246344695924,1019.7914746414634,0.0334706616360823,0.017669291409334,0.0177469377584025,,,0.5022593423971456,0.5247867348504247,0.11440760476398026,0.2831743820483903,0.1232745424530848,0.032113287176342875
KShape,15,,0.3962782482456559,0.7539017550565398,2234.038751219863,0.0213110654389683,0.006569568014584,0.012443260594478,,,0.2793275801750145,0.4734054370069865,0.6179697164997212,0.1471392008310003,0.03265372964855334,0.07681913336859734
KShape,25,,0.3111002115907659,0.8989652432496515,2205.5219477546093,0.0239624765150126,0.0025699639977045,0.0115699659443232,,,0.0,0.753373286367565,0.6061434748025761,0.176801796559063,0.0,-0.08710311129384793
DBA,3,,0.6134224523646222,0.5174393596186243,1589.636201077178,0.0488521349025612,0.1250553296795599,0.0187150458860101,,,0.9914168401250816,0.017040647148394168,0.35072867324851514,0.455254233161581,1.0,0.30798401585886737
DBA,5,,0.5262338608445839,0.5882668471260979,2032.820104653488,0.0363001795132918,0.0462952687470473,0.0123981465659697,,,0.7054959707954179,0.15373541162753895,0.5345220625168388,0.3148295432021296,0.35698390992206613,0.2682171435817378
DBA,10,,0.4594687018040139,0.6455833863810562,2760.8281676750903,0.0256235055736485,0.0121195821826519,0.0101787414312446,,,0.48655044155807814,0.26435448213162055,0.8364352598095934,0.1953845180027665,0.07796538085824607,0.24568775350430802
DBA,15,,0.4177924035155794,0.7451002406151804,3155.234410106372,0.0547580961725311,0.0062492205236841,0.010834723447843,,,0.3498797690615175,0.45641879826034937,1.0,0.5213270285570341,0.030038335645224217,0.21716716053153146
DBA,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,,,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Single,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,single,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Average,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,average,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Complete,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,complete,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Centroid,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,centroid,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,3,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,5,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,10,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,15,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,euclidean,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,dtw,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
Median,25,,0.4008805913881145,0.7555800552965729,2998.728365916879,0.0975446017744738,0.0026238206779121,0.0112687142470728,sbd,median,0.29442021935295964,0.47664450229237326,0.9350951691375857,1.0,0.0004396988971522348,0.23230777905597788
import pandas as pd
# Load the data
data = pd.read_csv("/home/jvaldes/Desktop/krishna-thesis/thesiscodeimplementation/clustering_results.csv")
# Define the weights for each validation index (negative weight = the index is minimized)
weights = {
    "Silhouette Score": 0.3,
    "Davies-Bouldin Score": -0.3,  # Minimized, hence the negative weight
    "Calinski-Harabasz Score": 0.2,
    "Dunn Index": 0.1,
    "COP Index": -0.1,  # Minimized, hence the negative weight
}
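# Worked check: applying these weights to the first PAM row's normalized scores
# (Silhouette 0.9752, DB 0.0327, CH 0.3442, Dunn 0.1781, COP 0.9968) gives
# 0.3*0.9752 - 0.3*0.0327 + 0.2*0.3442 + 0.1*0.1781 - 0.1*0.9968 ≈ 0.2697,
# matching that row's Composite Score in clustering_results.csv.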
# Min-max normalize each index so all scores share a common [0, 1] scale
def normalize(series):
    span = series.max() - series.min()
    if span == 0:
        return series - series.min()  # Constant column: contributes zero after weighting
    return (series - series.min()) / span

for col in weights.keys():
    if col in data.columns:
        data[f"normalized_{col}"] = normalize(data[col])
# Calculate a composite score as the weighted sum of the normalized indices
data["Composite Score"] = sum(
    weights[col] * data[f"normalized_{col}"]
    for col in weights.keys()
    if f"normalized_{col}" in data.columns
)
# Identify the best parameter combination
best_parameters = data.loc[data["Composite Score"].idxmax()]
# Display the best parameters
print("Best Parameter Combination:")
print(best_parameters)
# Save the results to a file
data.to_csv("results_with_composite_score.csv", index=False)