From 7bc11088f1af3fa22dbea4a0c6714c4c79dace9d Mon Sep 17 00:00:00 2001
From: Aida Nikkhah Nasab <aida.nikkhah-nasab@stud.th-deg.de>
Date: Tue, 11 Mar 2025 23:26:03 +0100
Subject: [PATCH] add data insertion and analysis scripts for InfluxDB and
 unique URL counting

---
 ...relation_beacons - Copy.py:Zone.Identifier |  0
 .../{insert-data-.py => Insert_Data.py}       |  0
 Codes/unic_urls/most_visited_urls.py          | 93 +++++++++++++++++++
 Thesis_Docs/README.md                         | 42 ++++++---
 4 files changed, 122 insertions(+), 13 deletions(-)
 delete mode 100644 Codes/FTT_autocorrelation/FFT_AutoCorrelation_beacons - Copy.py:Zone.Identifier
 rename Codes/insert_data/{insert-data-.py => Insert_Data.py} (100%)
 create mode 100644 Codes/unic_urls/most_visited_urls.py

diff --git a/Codes/FTT_autocorrelation/FFT_AutoCorrelation_beacons - Copy.py:Zone.Identifier b/Codes/FTT_autocorrelation/FFT_AutoCorrelation_beacons - Copy.py:Zone.Identifier
deleted file mode 100644
index e69de29..0000000
diff --git a/Codes/insert_data/insert-data-.py b/Codes/insert_data/Insert_Data.py
similarity index 100%
rename from Codes/insert_data/insert-data-.py
rename to Codes/insert_data/Insert_Data.py
diff --git a/Codes/unic_urls/most_visited_urls.py b/Codes/unic_urls/most_visited_urls.py
new file mode 100644
index 0000000..59e0601
--- /dev/null
+++ b/Codes/unic_urls/most_visited_urls.py
@@ -0,0 +1,93 @@
+"""Count unique URL hostnames visited per IP and plot the counts.
+
+Reads per-IP JSON log files and saves a bar chart of the counts.
+"""
+
+import os
+import json
+import matplotlib.pyplot as plt
+
+
+def count_unique_urls_per_ip(folder_path):
+    """Count unique visited URL hostnames per IP address.
+
+    Each file in *folder_path* is expected to be named after an IP
+    address and to contain one JSON object per line with an optional
+    "url_hostname" field.
+
+    Args:
+        folder_path: Directory containing per-IP JSON log files.
+
+    Returns:
+        Dict mapping file name (IP) to its unique-URL count, sorted
+        in decreasing order of the count.
+    """
+    ip_url_count = {}
+
+    for file_name in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file_name)
+
+        # Heuristic IPv4 check: dotted addresses contain at least 3 dots.
+        if os.path.isfile(file_path) and file_name.count('.') >= 3:
+            unique_urls = set()
+
+            try:
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    for line in file:
+                        try:
+                            log_entry = json.loads(line)
+                        except json.JSONDecodeError:
+                            continue  # Skip malformed lines.
+                        if not isinstance(log_entry, dict):
+                            continue  # Entries must be JSON objects.
+                        url = log_entry.get("url_hostname")
+                        if url:
+                            unique_urls.add(url)
+            except (OSError, UnicodeDecodeError) as e:
+                # Unreadable file: report it and move on to the next one.
+                print(f"Error reading file {file_name}: {e}")
+                continue
+
+            ip_url_count[file_name] = len(unique_urls)
+
+    # Sort by count, largest first.
+    return dict(sorted(ip_url_count.items(), key=lambda item: item[1], reverse=True))
+
+
+def generate_chart(ip_url_count, output_chart_path):
+    """Render a bar chart of unique-URL counts and save it to disk.
+
+    Args:
+        ip_url_count: Mapping of IP address to unique-URL count.
+        output_chart_path: File path for the saved PNG chart.
+    """
+    if not ip_url_count:
+        print("No data available for chart.")
+        return
+
+    ips = list(ip_url_count.keys())
+    url_counts = list(ip_url_count.values())
+
+    plt.figure(figsize=(14, 6))
+    plt.bar(range(len(ips)), url_counts, color='blue', edgecolor='black')
+    plt.xlabel('IP Addresses (Hidden)')
+    plt.ylabel('Unique URLs Visited')
+    plt.title('Number of Unique URLs Visited by Each IP')
+
+    # Hide X-axis tick labels: the chart intentionally keeps IPs hidden.
+    plt.xticks([])
+
+    plt.tight_layout()
+
+    plt.savefig(output_chart_path)
+    print(f"Chart saved to {output_chart_path}")
+    plt.show()
+
+
+if __name__ == "__main__":
+    # Input folder with per-IP JSON logs and output path for the chart.
+    folder_path = r"C:\Users\aydan\Downloads\export\EXPORT"
+    output_chart_path = r"C:\Users\aydan\Desktop\MT\MT2\ip_url_chart.png"
+
+    ip_url_count = count_unique_urls_per_ip(folder_path)
+    generate_chart(ip_url_count, output_chart_path)
diff --git a/Thesis_Docs/README.md b/Thesis_Docs/README.md
index 6cc6ac0..212dfb8 100644
--- a/Thesis_Docs/README.md
+++ b/Thesis_Docs/README.md
@@ -1,19 +1,35 @@
-# Studienarbeiten
+# Master’s Thesis: Performance Evaluation of the BAYWATCH Framework
 
-Dieses Repository enthält eine Vorlage für eine Studienarbeit in LaTeX. Der Student / die Studentin
-muss in `thesis.tex` die Rahmendaten anpassen. Die eigentliche Arbeit kann in `main.tex`
-oder (bei Bedarf) in weiteren `.tex`-Dateien erstellt werden. Die Arbeit kann mittels des Kommandos
-`make` kompiliert werden. Die Vorlage entspricht vorrangig den Vorgaben der Fakultät AI, sollte aber
-problemlos an andere Vorgaben anpassbar sein. Selbstverständlich sollte die Verwendung der Vorlage
-sowie notwendige Anpassungen vorab mit dem Betreuer der Arbeit geklärt werden.
+This repository contains the work for my Master’s Thesis, which evaluates the performance of the BAYWATCH Framework on both real and synthetic data. The project involves data analysis, synthetic data generation, and performance testing of the framework.
 
-## Beispiel
+## Project Overview
 
-![Titelseite einer Arbeit](Titlepage.png "Titelseite einer Arbeit")
+The main objective of this thesis is to assess how well the BAYWATCH Framework performs on different datasets. The process involves:
 
----
-© 2021 [Andreas Fischer](mailto:andreas.fischer@th-deg.de)
+1. Data Preparation  
+   - Setting up an InfluxDB database.  
+   - Importing real network activity data into the database.  
+   - Due to the large dataset size, a data format file is provided to show the structure of the data.  
 
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Lizenzvertrag" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br />Dieses Werk ist lizenziert unter einer <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Namensnennung - Weitergabe unter gleichen Bedingungen 4.0 International Lizenz</a>.
+2. Data Analysis  
+   - Analyzing the imported data to identify patterns and trends.  
 
-Eine reine Erstellung einer Studienarbeit stellt noch keine Bearbeitung der Vorlage dar. Die entsprechende Studienarbeit unterliegt damit nicht der Vorgabe zur Weitergabe unter gleichen Bedingungen.
+3. Synthetic Data Generation  
+   - Creating synthetic datasets based on the observed patterns in the real data.  
+
+4. Performance Evaluation  
+   - Testing the BAYWATCH Framework on both real and synthetic data.  
+   - Comparing results to assess its effectiveness.  
+
+## Repository Structure
+
+- /src – Contains scripts for data import, analysis, and performance evaluation.  
+- /data – Includes the data format file (the actual dataset is too large to store).  
+- /results – Stores output files and performance metrics.  
+- /docs – Thesis document, presentation slides, and additional documentation.  
+
+## Setup Instructions
+
+1. Clone the Repository  
+   ```bash
+   git clone https://mygit.th-deg.de/an28964/master-thesis.git
-- 
GitLab