author    Cedric Nugteren <web@cedricnugteren.nl>    2016-09-10 11:12:09 +0200
committer Cedric Nugteren <web@cedricnugteren.nl>    2016-09-10 11:12:09 +0200
commit    3daba709974fa2b13b2c44be8e555f2bc6d8356a (patch)
tree      80bd36aea06d7a836d9d1ce91dd89a985d7e7602 /scripts/database
parent    55038d3c919a6584e5e5891e2290c67698f3c90d (diff)
Updated the database script to remove duplicate entries: keeps only the best-performing case for each specific parameter combination
Diffstat (limited to 'scripts/database')
-rwxr-xr-x  scripts/database/database.py            2
-rw-r--r--  scripts/database/database/db.py        30
-rw-r--r--  scripts/database/database/defaults.py  10
3 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/scripts/database/database.py b/scripts/database/database.py
index 6d370d99..944c1bd6 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -77,12 +77,12 @@ def main(argv):
# Adds the new data to the database
old_size = len(database.index)
database = db.concatenate_database(database, imported_data)
- database = db.remove_duplicates(database)
new_size = len(database.index)
print("with " + str(new_size - old_size) + " new items") # Newline printed here
# Stores the modified database back to disk
if len(glob.glob(json_files)) >= 1:
+ database = db.remove_duplicates(database)
io.save_database(database, database_filename)
# Optional: update the database here. Default is disabled, code below is just an example
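The reordering above means deduplication now runs once, immediately before the database is written back to disk, rather than after every concatenation. A minimal sketch of the resulting flow, assuming hypothetical io.load_database/io.load_json helpers and loop structure around the calls shown in the hunk:

# Sketch of the updated main() flow (surrounding structure is assumed):
database = io.load_database(database_filename)       # assumed loader
for file_json in glob.glob(json_files):
    imported_data = io.load_json(file_json)          # assumed import helper
    database = db.concatenate_database(database, imported_data)
if len(glob.glob(json_files)) >= 1:
    database = db.remove_duplicates(database)        # one pass, just before saving
    io.save_database(database, database_filename)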
diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py
index 60cfbcfa..6534d689 100644
--- a/scripts/database/database/db.py
+++ b/scripts/database/database/db.py
@@ -6,6 +6,7 @@
# Cedric Nugteren <www.cedricnugteren.nl>
import pandas as pd
+import numpy as np
def get_entries_by_field(database, field, value):
@@ -18,11 +19,6 @@ def concatenate_database(database1, database2):
return pd.concat([database1, database2])
-def remove_duplicates(database):
- """Removes duplicates from a database"""
- return database.drop_duplicates()
-
-
def find_and_replace(database, dictionary):
"""Finds and replaces entries in a database based on a dictionary. Example:
dictionary = { "key_to_edit": { find1: replace1, find2: replace2 } }"""
@@ -48,3 +44,27 @@ def update_database(database, condition, field, value):
"""Updates the database by writing a specific value to a given field, given certain conditions"""
database.loc[condition, field] = value
return database
+
+
+def remove_duplicates(database):
+ """Removes duplicates from the database based on all but the 'time' column"""
+
+ # First remove 100% duplicate entries
+ database = database.drop_duplicates()
+
+ # Replace NaNs with -1 first (needed for groupby)
+ database = database.replace(np.nan, -1)
+
+ # In case multiple runs were made for the exact same configuration: take only the best-performing one into account
+ other_column_names = list(database.columns.values)
+ other_column_names.remove("time")
+ database_by_time = database.groupby(other_column_names)
+ num_removals = len(database) - len(database_by_time)
+ if num_removals > 0:
+ print("[database] Removing %d entries: keeping only those with the lowest execution time" % num_removals)
+ print("[database] Note: this might take a while")
+ database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()])
+
+ # Re-replace the NaN values
+ database = database.replace(-1, np.nan)
+ return database
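To illustrate what the new remove_duplicates keeps, here is a small self-contained demo on a toy DataFrame (the column names are invented for the example; only pandas and numpy are required):

import numpy as np
import pandas as pd

# Toy database: two runs of the same configuration (m=128) with different
# execution times, plus one unrelated configuration (m=256).
toy = pd.DataFrame({"device": ["gpu_a", "gpu_a", "gpu_a"],
                    "m":      [128,     128,     256],
                    "time":   [2.5,     1.9,     3.1]})

# Group on every column except 'time' and keep the fastest run per group,
# mirroring the groupby/apply in the function above.
keys = [c for c in toy.columns if c != "time"]
best = toy.groupby(keys).apply(lambda x: x[x["time"] == x["time"].min()])
print(best.reset_index(drop=True))  # rows: (gpu_a, 128, 1.9) and (gpu_a, 256, 3.1)

The NaN-to--1 round-trip in the function is needed because pandas' groupby drops rows whose key columns contain NaN by default, which would otherwise silently discard those entries.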
diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py
index 3bde33c1..d71e604f 100644
--- a/scripts/database/database/defaults.py
+++ b/scripts/database/database/defaults.py
@@ -81,16 +81,6 @@ def get_common_best(database, group_name, verbose):
# Removes columns without any values
database = database.dropna(axis=1, how='all')
- database = database.reset_index()
-
- # In case multiple runs were made for the exact same configuration: take only the best-performing one into account
- other_column_names = list(database.columns.values)
- other_column_names.remove("time")
- database_by_time = database.groupby(other_column_names)
- if len(database_by_time) != len(database):
- if verbose:
- print("[database] " + str(group_name) + " keeping only entries with the lowest execution time")
- database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()])
# Inserts the relative execution times into the database
def relative_performance(x):
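The hunk is truncated here, so the body of relative_performance is not shown. As a purely hypothetical sketch, assuming it normalizes each entry's time against the fastest time in its group (the actual implementation in defaults.py may differ):

def relative_performance(x):
    # Hypothetical body, not taken from the diff: 1.0 marks the fastest
    # entry in the group, smaller values mark slower entries.
    return x["time"].min() / x["time"]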