Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Sayed Saeedi
MasterThesis
Commits
23191bda
Commit
23191bda
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
codes for preprocessing
parent
39404018
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Preprocessing/All_the_datasets.ipynb
+209
-0
209 additions, 0 deletions
Preprocessing/All_the_datasets.ipynb
with
209 additions
and
0 deletions
Preprocessing/All_the_datasets.ipynb
0 → 100644
+
209
−
0
View file @
23191bda
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "6e35f5e6-c85e-4d41-a019-ded9fef99bae",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c0aae72-18f5-48f2-a4ff-a039da52ff35",
"metadata": {},
"outputs": [],
"source": [
"#loading all files from the location\n",
"pd.set_option('display.max_rows', None, 'display.max_columns', None) # display unlimited number of lines\n",
"#loading files\n",
"file_names = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv',\n",
" '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv']\n",
"\n",
"#loading all dataset in a dictionary\n",
"dfs={}\n",
"for file in file_names:\n",
" df=pd.read_csv(f'~/Datasets/{file}')\n",
" dfs[file]=df\n",
" \n",
"#Droping [\"Flow ID\", \"Src IP\", \"Src Port\", \"Dst IP\"] columns from 02-20-2018 file\n",
"dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3",
"metadata": {},
"outputs": [],
"source": [
"def print_label_counts(dfs):\n",
" \"\"\"\n",
" counting the different categories in each Label\n",
" Parameters:\n",
" - dfs: Dictionary of DataFrames.\n",
" \"\"\"\n",
" for key in dfs.keys():\n",
" df = dfs[key] # Get the dataframe corresponding to the key\n",
" count = df['Label'].value_counts() # Perform value count on the 'Label' column\n",
" print(f\"Value counts for dataframe '{key}':\\n{count}\\n\")\n",
"\n",
"print_label_counts(dfs)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fc6eb2d-8451-4cc4-9ae0-029182366994",
"metadata": {},
"outputs": [],
"source": [
"#preprocessing\n",
"\"\"\"\n",
"-Deleting duplicates\n",
"-changing inf and -inf to NaN\n",
"-changing time to unix format\n",
"-changing data types to numeric except for the 'Label' column\n",
"-dropping Na\n",
"-dropping negative values except ['Init Bwd Win Byts', 'Init Fwd Win Byts']\n",
"\"\"\"\n",
"for key in dfs.keys():\n",
" df=dfs[key]\n",
" print(f\"Dataframe: '{key}', shape before preprocessing: {df.shape}\")\n",
" df.drop_duplicates(inplace=True) \n",
" df.replace([np.inf, -np.inf], np.nan, inplace=True) \n",
" \n",
" df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')\n",
" df['Timestamp'] = (df['Timestamp'] - pd.Timestamp(\"1970-01-01\")) // pd.Timedelta('1s') \n",
" \n",
" for col in df.columns: #changing to numeric if not, then to NaN\n",
" if df[col].dtype == 'object' and col != 'Label':\n",
" df[col] = pd.to_numeric(df[col], errors='coerce')\n",
"\n",
"\n",
" df.dropna(inplace=True)\n",
"\n",
" for col in df.columns:\n",
" if col not in ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']:\n",
" df.loc[df[col] < 0, col] = np.nan\n",
"\n",
" df.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
" \n",
" df.dropna(inplace=True)\n",
" print(f\"shape after preprocessing: {df.shape}\\n\") \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c3d70f5-7052-4e6c-bdce-fe36dff94008",
"metadata": {},
"outputs": [],
"source": [
"def aggregate_classes(dfs, classes):\n",
" \"\"\"\n",
" Aggregates traffic data into separate DataFrames based on specified labels.\n",
"\n",
" Parameters:\n",
" - dfs: Dictionary of DataFrames loaded from CSV files.\n",
" - classes: Dictionary with keys being traffic category and values being lists of labels associated with that category.\n",
"\n",
" Returns:\n",
" - A dictionary of aggregated DataFrames for each category.\n",
" \"\"\"\n",
" aggregated_data = {}\n",
" \n",
" for category, labels in classes.items():\n",
" aggregated_data[category] = pd.DataFrame()\n",
" \n",
" for label in labels:\n",
" # Iterating through all DataFrames to filter and aggregate the different labels\n",
" for key in dfs:\n",
" df = dfs[key]\n",
" filtered_df = df[df[\"Label\"] == label]\n",
" aggregated_data[category] = pd.concat([aggregated_data[category], filtered_df], axis=0, ignore_index=True)\n",
" \n",
" return aggregated_data\n",
"\n",
"\n",
"# the specified labels\n",
"classes = {\n",
" \"BruteForce\": [\"FTP-BruteForce\", \"SSH-Bruteforce\", \"Brute Force -Web\", \"Brute Force -XSS\"],\n",
" \"DoS\": [\"DoS attacks-GoldenEye\", \"DoS attacks-Slowloris\", \"DoS attacks-Hulk\", \"DoS attacks-SlowHTTPTest\", \"DDoS attacks-LOIC-HTTP\", \"DDOS attack-HOIC\", \"DDOS attack-LOIC-UDP\"],\n",
" \"Infiltration\": [\"Infilteration\"],\n",
" \"Bot\": [\"Bot\"],\n",
" \"Benign\": [\"Benign\"]\n",
"}\n",
"\n",
"aggregated_data = aggregate_classes(dfs, classes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7062976-df28-43f5-8a86-226c87838123",
"metadata": {},
"outputs": [],
"source": [
"# Access the aggregated DataFrames for each category\n",
"bruteforce_attacks = aggregated_data[\"BruteForce\"]\n",
"doS_attacks = aggregated_data[\"DoS\"]\n",
"infiltration_attacks = aggregated_data[\"Infiltration\"]\n",
"bot_attacks = aggregated_data[\"Bot\"]\n",
"benign = aggregated_data[\"Benign\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec01bd77-2f10-42fb-b48f-babe38204108",
"metadata": {},
"outputs": [],
"source": [
"#Saving each traffic category\n",
"save_directory = 'Datasets/Preprocessed_Datasets'\n",
"\n",
"bruteforce_attacks.to_csv(f'{save_directory}/bruteforce_attacks.csv', index=False)\n",
"doS_attacks.to_csv(f'{save_directory}/doS_attacks.csv', index=False)\n",
"infiltration_attacks.to_csv(f'{save_directory}/infiltration_attacks.csv', index=False)\n",
"bot_attacks.to_csv(f'{save_directory}/bot_attacks.csv', index=False)\n",
"benign.to_csv(f'{save_directory}/benign.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8806c851-f3a6-4962-9c4b-0e2e226a6901",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:6e35f5e6-c85e-4d41-a019-ded9fef99bae tags:
```
python
import
pandas
as
pd
import
numpy
as
np
import
os
```
%% Cell type:code id:5c0aae72-18f5-48f2-a4ff-a039da52ff35 tags:
```
python
# Load every CSE-CIC-IDS2018 daily capture from the Datasets directory.
pd.set_option('display.max_rows', None, 'display.max_columns', None)  # show full frames when printed

file_names = [
    '02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv',
    '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv', '02-28-2018.csv',
    '03-01-2018.csv', '03-02-2018.csv',
]

# Read each CSV into a dictionary keyed by its file name.
dfs = {}
for name in file_names:
    dfs[name] = pd.read_csv(f'~/Datasets/{name}')

# The 02-20-2018 capture carries four extra flow-identifier columns that the
# other files do not have; drop them so every frame shares one schema.
dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)
```
%% Cell type:code id:ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3 tags:
```
python
def print_label_counts(dfs):
    """Print the value counts of the 'Label' column for every DataFrame.

    Parameters:
    - dfs: Dictionary mapping file names to DataFrames.
    """
    for name, frame in dfs.items():
        counts = frame['Label'].value_counts()
        print(f"Value counts for dataframe '{name}':\n{counts}\n")

print_label_counts(dfs)
```
%% Cell type:code id:6fc6eb2d-8451-4cc4-9ae0-029182366994 tags:
```
python
# Clean every daily frame in place:
#  - drop duplicate rows
#  - map +/-inf to NaN
#  - convert 'Timestamp' to Unix seconds
#  - coerce every non-'Label' object column to numeric (failures become NaN)
#  - drop rows containing NaN
#  - blank out negative values except in the two Init*Win Byts columns, then drop again
for name, frame in dfs.items():
    print(f"Dataframe: '{name}', shape before preprocessing: {frame.shape}")

    frame.drop_duplicates(inplace=True)
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Parse the timestamps, then express them as integer seconds since the epoch.
    parsed = pd.to_datetime(frame['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
    frame['Timestamp'] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    # Everything except the class label should be numeric; bad entries become NaN.
    for column in frame.columns:
        if frame[column].dtype == 'object' and column != 'Label':
            frame[column] = pd.to_numeric(frame[column], errors='coerce')

    frame.dropna(inplace=True)

    # Negative readings are invalid everywhere except the TCP window-size columns.
    for column in frame.columns:
        if column not in ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']:
            frame.loc[frame[column] < 0, column] = np.nan

    # to_numeric may have introduced fresh infs from textual values; clear those too.
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    frame.dropna(inplace=True)

    print(f"shape after preprocessing: {frame.shape}\n")
```
%% Cell type:code id:1c3d70f5-7052-4e6c-bdce-fe36dff94008 tags:
```
python
def aggregate_classes(dfs, classes):
    """Aggregate traffic data into separate DataFrames based on specified labels.

    Parameters:
    - dfs: Dictionary of DataFrames loaded from CSV files.
    - classes: Dictionary mapping a traffic category to the list of 'Label'
      values that belong to it.

    Returns:
    - A dictionary mapping each category to a single DataFrame holding every
      row (from every input frame) whose 'Label' is in that category.
    """
    aggregated_data = {}

    for category, labels in classes.items():
        # Collect all matching slices first and concatenate once at the end:
        # calling pd.concat inside the loop re-copies the accumulated frame on
        # every iteration, which is quadratic in the total row count.
        pieces = []
        for label in labels:
            # Iterate through all DataFrames to filter out rows for this label.
            for df in dfs.values():
                pieces.append(df[df["Label"] == label])

        if pieces:
            aggregated_data[category] = pd.concat(pieces, axis=0, ignore_index=True)
        else:
            # No labels (or no input frames) for this category: empty result,
            # matching the original behavior.
            aggregated_data[category] = pd.DataFrame()

    return aggregated_data
# The label groupings used to bucket the raw traffic classes.
# NOTE: "Infilteration" is the dataset's own (misspelled) label — keep as-is.
classes = {
    "BruteForce": [
        "FTP-BruteForce", "SSH-Bruteforce",
        "Brute Force -Web", "Brute Force -XSS",
    ],
    "DoS": [
        "DoS attacks-GoldenEye", "DoS attacks-Slowloris", "DoS attacks-Hulk",
        "DoS attacks-SlowHTTPTest", "DDoS attacks-LOIC-HTTP",
        "DDOS attack-HOIC", "DDOS attack-LOIC-UDP",
    ],
    "Infiltration": ["Infilteration"],
    "Bot": ["Bot"],
    "Benign": ["Benign"],
}

aggregated_data = aggregate_classes(dfs, classes)
```
%% Cell type:code id:b7062976-df28-43f5-8a86-226c87838123 tags:
```
python
# Unpack the aggregated DataFrame for each category into its own variable.
bruteforce_attacks, doS_attacks, infiltration_attacks, bot_attacks, benign = (
    aggregated_data[key]
    for key in ("BruteForce", "DoS", "Infiltration", "Bot", "Benign")
)
```
%% Cell type:code id:ec01bd77-2f10-42fb-b48f-babe38204108 tags:
```
python
# Save each traffic category to its own CSV under the preprocessed directory.
save_directory = 'Datasets/Preprocessed_Datasets'
# to_csv does not create missing directories; make sure the target exists
# ('os' is imported in the first cell but was previously unused).
os.makedirs(save_directory, exist_ok=True)

# Map output file stems to their frames so each save is a single loop body.
category_frames = {
    'bruteforce_attacks': bruteforce_attacks,
    'doS_attacks': doS_attacks,
    'infiltration_attacks': infiltration_attacks,
    'bot_attacks': bot_attacks,
    'benign': benign,
}
for stem, frame in category_frames.items():
    frame.to_csv(f'{save_directory}/{stem}.csv', index=False)
```
%% Cell type:code id:8806c851-f3a6-4962-9c4b-0e2e226a6901 tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment