Commit 23191bda authored by Sayed Saeedi's avatar Sayed Saeedi

codes for preprocessing

parent 39404018
%% Cell type:code id:6e35f5e6-c85e-4d41-a019-ded9fef99bae tags:
``` python
import pandas as pd
import numpy as np
import os
```
%% Cell type:code id:5c0aae72-18f5-48f2-a4ff-a039da52ff35 tags:
``` python
# Display all rows and columns when printing DataFrames
pd.set_option('display.max_rows', None, 'display.max_columns', None)

# CSV files to load
file_names = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv',
              '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv']

# Load every file into a dictionary keyed by file name
dfs = {}
for file in file_names:
    df = pd.read_csv(f'~/Datasets/{file}')
    dfs[file] = df

# Drop the ['Flow ID', 'Src IP', 'Src Port', 'Dst IP'] columns from the 02-20-2018 file
dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)
```
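%% Cell type:markdown tags:

A quick sanity check, not part of the original commit: after dropping the four extra columns from 02-20-2018.csv, all files should expose the same column set. The choice of 02-14-2018.csv as the reference file is arbitrary.

%% Cell type:code tags:
``` python
# Sanity check (assumption: the drop above leaves all files with a common schema)
reference_cols = list(dfs['02-14-2018.csv'].columns)  # arbitrary reference file
for key, df in dfs.items():
    print(f"{key}: shape={df.shape}, columns match reference: {list(df.columns) == reference_cols}")
```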
%% Cell type:code id:ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3 tags:
``` python
def print_label_counts(dfs):
    """
    Print the value counts of the 'Label' column for each DataFrame.

    Parameters:
    - dfs: Dictionary of DataFrames.
    """
    for key in dfs.keys():
        df = dfs[key]                         # DataFrame corresponding to the key
        count = df['Label'].value_counts()    # Count occurrences of each label
        print(f"Value counts for dataframe '{key}':\n{count}\n")

print_label_counts(dfs)
```
%% Cell type:code id:6fc6eb2d-8451-4cc4-9ae0-029182366994 tags:
``` python
# Preprocessing:
# - drop duplicate rows
# - replace inf and -inf with NaN
# - convert 'Timestamp' to Unix time (seconds since 1970-01-01)
# - convert every column except 'Label' to a numeric dtype
# - drop rows containing NaN
# - drop rows with negative values, except in ['Init Bwd Win Byts', 'Init Fwd Win Byts']
for key in dfs.keys():
    df = dfs[key]
    print(f"Dataframe: '{key}', shape before preprocessing: {df.shape}")
    df.drop_duplicates(inplace=True)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
    df['Timestamp'] = (df['Timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
    for col in df.columns:
        # Coerce non-numeric columns (except 'Label') to numeric; unparseable values become NaN
        if df[col].dtype == 'object' and col != 'Label':
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df.dropna(inplace=True)
    for col in df.columns:
        # Mark negative values as NaN, except in the window-size columns and 'Label'
        if col not in ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']:
            df.loc[df[col] < 0, col] = np.nan
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    print(f"shape after preprocessing: {df.shape}\n")
```
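%% Cell type:markdown tags:

An optional verification cell, a minimal sketch assuming every column except 'Label' became numeric in the loop above: it asserts that no NaN, infinity, or disallowed negative values survived the preprocessing.

%% Cell type:code tags:
``` python
# Optional check of the preprocessing above (assumes all non-'Label' columns are numeric)
for key, df in dfs.items():
    numeric = df.drop(columns=['Label'])
    assert not numeric.isna().any().any(), f"NaN left in {key}"
    assert np.isfinite(numeric.to_numpy()).all(), f"inf left in {key}"
    check_cols = [c for c in numeric.columns if c not in ['Init Bwd Win Byts', 'Init Fwd Win Byts']]
    assert (numeric[check_cols] >= 0).all().all(), f"negative values left in {key}"
    print(f"'{key}' passed: shape {df.shape}")
```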
%% Cell type:code id:1c3d70f5-7052-4e6c-bdce-fe36dff94008 tags:
``` python
def aggregate_classes(dfs, classes):
    """
    Aggregate traffic data into separate DataFrames based on the specified labels.

    Parameters:
    - dfs: Dictionary of DataFrames loaded from the CSV files.
    - classes: Dictionary mapping each traffic category to the list of labels that belong to it.

    Returns:
    - A dictionary of aggregated DataFrames, one per category.
    """
    aggregated_data = {}
    for category, labels in classes.items():
        aggregated_data[category] = pd.DataFrame()
        for label in labels:
            # Filter each DataFrame for the current label and append the matching rows
            for key in dfs:
                df = dfs[key]
                filtered_df = df[df["Label"] == label]
                aggregated_data[category] = pd.concat([aggregated_data[category], filtered_df], axis=0, ignore_index=True)
    return aggregated_data

# Labels belonging to each traffic category
classes = {
    "BruteForce": ["FTP-BruteForce", "SSH-Bruteforce", "Brute Force -Web", "Brute Force -XSS"],
    "DoS": ["DoS attacks-GoldenEye", "DoS attacks-Slowloris", "DoS attacks-Hulk", "DoS attacks-SlowHTTPTest", "DDoS attacks-LOIC-HTTP", "DDOS attack-HOIC", "DDOS attack-LOIC-UDP"],
    "Infiltration": ["Infilteration"],
    "Bot": ["Bot"],
    "Benign": ["Benign"]
}
aggregated_data = aggregate_classes(dfs, classes)
```
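%% Cell type:markdown tags:

A short look at the aggregation result, not in the original commit: row count per category and which labels actually ended up in it (an empty list would indicate a label-name mismatch).

%% Cell type:code tags:
``` python
# Summarise the aggregated categories
for category, df in aggregated_data.items():
    labels_found = sorted(df['Label'].unique()) if 'Label' in df.columns else []
    print(f"{category}: {len(df)} rows, labels found: {labels_found}")
```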
%% Cell type:code id:b7062976-df28-43f5-8a86-226c87838123 tags:
``` python
# Access the aggregated DataFrames for each category
bruteforce_attacks = aggregated_data["BruteForce"]
doS_attacks = aggregated_data["DoS"]
infiltration_attacks = aggregated_data["Infiltration"]
bot_attacks = aggregated_data["Bot"]
benign = aggregated_data["Benign"]
```
%% Cell type:code id:ec01bd77-2f10-42fb-b48f-babe38204108 tags:
``` python
# Save each traffic category to its own CSV file
save_directory = 'Datasets/Preprocessed_Datasets'
os.makedirs(save_directory, exist_ok=True)  # make sure the output directory exists
bruteforce_attacks.to_csv(f'{save_directory}/bruteforce_attacks.csv', index=False)
doS_attacks.to_csv(f'{save_directory}/doS_attacks.csv', index=False)
infiltration_attacks.to_csv(f'{save_directory}/infiltration_attacks.csv', index=False)
bot_attacks.to_csv(f'{save_directory}/bot_attacks.csv', index=False)
benign.to_csv(f'{save_directory}/benign.csv', index=False)
```
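%% Cell type:markdown tags:

An optional round-trip check on one of the files just written, assuming the save above succeeded: reload benign.csv and confirm the row count is preserved.

%% Cell type:code tags:
``` python
# Round-trip check on one saved file
reloaded = pd.read_csv(f'{save_directory}/benign.csv')
print(f"benign.csv: saved {len(benign)} rows, reloaded {len(reloaded)} rows")
```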
%% Cell type:code id:8806c851-f3a6-4962-9c4b-0e2e226a6901 tags:
``` python
```