diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-10 11:12:09 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-10 11:12:09 +0200 |
commit | 3daba709974fa2b13b2c44be8e555f2bc6d8356a (patch) | |
tree | 80bd36aea06d7a836d9d1ce91dd89a985d7e7602 /scripts/database | |
parent | 55038d3c919a6584e5e5891e2290c67698f3c90d (diff) |
Updated the database script to remove duplicate entries: keeps only the best-performing cases for a specific parameter combination
Diffstat (limited to 'scripts/database')
-rwxr-xr-x | scripts/database/database.py | 2 | ||||
-rw-r--r-- | scripts/database/database/db.py | 30 | ||||
-rw-r--r-- | scripts/database/database/defaults.py | 10 |
3 files changed, 26 insertions, 16 deletions
diff --git a/scripts/database/database.py b/scripts/database/database.py index 6d370d99..944c1bd6 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -77,12 +77,12 @@ def main(argv): # Adds the new data to the database old_size = len(database.index) database = db.concatenate_database(database, imported_data) - database = db.remove_duplicates(database) new_size = len(database.index) print("with " + str(new_size - old_size) + " new items") # Newline printed here # Stores the modified database back to disk if len(glob.glob(json_files)) >= 1: + database = db.remove_duplicates(database) io.save_database(database, database_filename) # Optional: update the database here. Default is disabled, code below is just an example diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py index 60cfbcfa..6534d689 100644 --- a/scripts/database/database/db.py +++ b/scripts/database/database/db.py @@ -6,6 +6,7 @@ # Cedric Nugteren <www.cedricnugteren.nl> import pandas as pd +import numpy as np def get_entries_by_field(database, field, value): @@ -18,11 +19,6 @@ def concatenate_database(database1, database2): return pd.concat([database1, database2]) -def remove_duplicates(database): - """Removes duplicates from a database""" - return database.drop_duplicates() - - def find_and_replace(database, dictionary): """Finds and replaces entries in a database based on a dictionary. 
Example: dictionary = { "key_to_edit": { find1: replace1, find2: replace2 } }""" @@ -48,3 +44,27 @@ def update_database(database, condition, field, value): """Updates the database by writing a specific value to a given field, given certain conditions""" database.loc[condition, field] = value return database + + +def remove_duplicates(database): + """Removes duplicates from the database based on all but the 'time' column""" + + # First remove 100% duplicate entries + database = database.drop_duplicates() + + # Replace NaNs with -1 first (needed for groupby) + database = database.replace(np.nan, -1) + + # In case multiple runs for the exact same configuration were made: take just the best performing one into account + other_column_names = list(database.columns.values) + other_column_names.remove("time") + database_by_time = database.groupby(other_column_names,) + num_removals = len(database) - len(database_by_time) + if num_removals > 0: + print("[database] Removing %d entries: keeping only those with the lowest execution time" % num_removals) + print("[database] Note: this might take a while") + database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()]) + + # Re-replace the NaN values + database = database.replace(-1, np.nan) + return database diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 3bde33c1..d71e604f 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -81,16 +81,6 @@ def get_common_best(database, group_name, verbose): # Removes columns without any values database = database.dropna(axis=1, how='all') - database = database.reset_index() - - # In case multiple runs for the exact same configuration where made: take just the best performing one into account - other_column_names = list(database.columns.values) - other_column_names.remove("time") - database_by_time = database.groupby(other_column_names) - if len(database_by_time) != len(database): - if 
verbose: - print("[database] " + str(group_name) + " keeping only entries with the lowest execution time") - database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()]) # Inserts the relative execution times into the database def relative_performance(x): |