author    Cedric Nugteren <web@cedricnugteren.nl>    2016-09-10 11:12:09 +0200
committer Cedric Nugteren <web@cedricnugteren.nl>    2016-09-10 11:12:09 +0200
commit    3daba709974fa2b13b2c44be8e555f2bc6d8356a (patch)
tree      80bd36aea06d7a836d9d1ce91dd89a985d7e7602 /scripts/database
parent    55038d3c919a6584e5e5891e2290c67698f3c90d (diff)
Updated the database script to remove duplicate entries: keeps only the best-performing case for each specific parameter combination
Diffstat (limited to 'scripts/database')
-rwxr-xr-x  scripts/database/database.py            2
-rw-r--r--  scripts/database/database/db.py        30
-rw-r--r--  scripts/database/database/defaults.py  10
3 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/scripts/database/database.py b/scripts/database/database.py
index 6d370d99..944c1bd6 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -77,12 +77,12 @@ def main(argv):
# Adds the new data to the database
old_size = len(database.index)
database = db.concatenate_database(database, imported_data)
- database = db.remove_duplicates(database)
new_size = len(database.index)
print("with " + str(new_size - old_size) + " new items") # Newline printed here
# Stores the modified database back to disk
if len(glob.glob(json_files)) >= 1:
+ database = db.remove_duplicates(database)
io.save_database(database, database_filename)
# Optional: update the database here. Default is disabled, code below is just an example
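The reordering above means deduplication now runs once, immediately before the database is written back to disk, rather than after every concatenation. A minimal sketch of the resulting flow, assuming hypothetical io.load_database/io.load_json helpers and loop structure around the calls shown in the hunk:

# Sketch of the updated main() flow (surrounding structure is assumed):
database = io.load_database(database_filename)       # assumed loader
for file_json in glob.glob(json_files):
    imported_data = io.load_json(file_json)          # assumed import helper
    database = db.concatenate_database(database, imported_data)
if len(glob.glob(json_files)) >= 1:
    database = db.remove_duplicates(database)        # one pass, just before saving
    io.save_database(database, database_filename)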
diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py
index 60cfbcfa..6534d689 100644
--- a/scripts/database/database/db.py
+++ b/scripts/database/database/db.py
@@ -6,6 +6,7 @@
# Cedric Nugteren <www.cedricnugteren.nl>
import pandas as pd
+import numpy as np
def get_entries_by_field(database, field, value):
@@ -18,11 +19,6 @@ def concatenate_database(database1, database2):
return pd.concat([database1, database2])
-def remove_duplicates(database):
- """Removes duplicates from a database"""
- return database.drop_duplicates()
-
-
def find_and_replace(database, dictionary):
"""Finds and replaces entries in a database based on a dictionary. Example:
dictionary = { "key_to_edit": { find1: replace1, find2: replace2 } }"""
@@ -48,3 +44,27 @@ def update_database(database, condition, field, value):
"""Updates the database by writing a specific value to a given field, given certain conditions"""
database.loc[condition, field] = value
return database
+
+
+def remove_duplicates(database):
+ """Removes duplicates from the database based on all but the 'time' column"""
+
+ # First remove 100% duplicate entries
+ database = database.drop_duplicates()
+
+ # Replace NaNs with -1 first (needed for groupby)
+ database = database.replace(np.nan, -1)
+
+ # In case multiple runs were made for the exact same configuration: take only the best-performing one into account
+ other_column_names = list(database.columns.values)
+ other_column_names.remove("time")
+ database_by_time = database.groupby(other_column_names)
+ num_removals = len(database) - len(database_by_time)
+ if num_removals > 0:
+ print("[database] Removing %d entries: keeping only those with the lowest execution time" % num_removals)
+ print("[database] Note: this might take a while")
+ database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()])
+
+ # Re-replace the NaN values
+ database = database.replace(-1, np.nan)
+ return database
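To illustrate what the new remove_duplicates keeps, here is a small self-contained demo on a toy DataFrame (the column names are invented for the example; only pandas and numpy are required):

import numpy as np
import pandas as pd

# Toy database: two runs of the same configuration (m=128) with different
# execution times, plus one unrelated configuration (m=256).
toy = pd.DataFrame({"device": ["gpu_a", "gpu_a", "gpu_a"],
                    "m":      [128,     128,     256],
                    "time":   [2.5,     1.9,     3.1]})

# Group on every column except 'time' and keep the fastest run per group,
# mirroring the groupby/apply in the function above.
keys = [c for c in toy.columns if c != "time"]
best = toy.groupby(keys).apply(lambda x: x[x["time"] == x["time"].min()])
print(best.reset_index(drop=True))  # rows: (gpu_a, 128, 1.9) and (gpu_a, 256, 3.1)

The NaN-to--1 round-trip in the function is needed because pandas' groupby drops rows whose key columns contain NaN by default, which would otherwise silently discard those entries.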
diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py
index 3bde33c1..d71e604f 100644
--- a/scripts/database/database/defaults.py
+++ b/scripts/database/database/defaults.py
@@ -81,16 +81,6 @@ def get_common_best(database, group_name, verbose):
# Removes columns without any values
database = database.dropna(axis=1, how='all')
- database = database.reset_index()
-
- # In case multiple runs were made for the exact same configuration: take only the best-performing one into account
- other_column_names = list(database.columns.values)
- other_column_names.remove("time")
- database_by_time = database.groupby(other_column_names)
- if len(database_by_time) != len(database):
- if verbose:
- print("[database] " + str(group_name) + " keeping only entries with the lowest execution time")
- database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()])
# Inserts the relative execution times into the database
def relative_performance(x):
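The hunk is truncated here, so the body of relative_performance is not shown. As a purely hypothetical sketch, assuming it normalizes each entry's time against the fastest time in its group (the actual implementation in defaults.py may differ):

def relative_performance(x):
    # Hypothetical body, not taken from the diff: 1.0 marks the fastest
    # entry in the group, smaller values mark slower entries.
    return x["time"].min() / x["time"]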