From 29c9f8464c8cb8a85a3aabf6aedb026d20c8dbc1 Mon Sep 17 00:00:00 2001
From: vdshk
Date: Sat, 8 May 2021 19:48:43 +0300
Subject: [PATCH] Updated project utils

---
 meta_table_for_readme.py       | 90 ----------------------------------
 script.py                      | 46 -----------------
 utils/config.py                |  3 ++
 utils/fetch_dataset.py         | 44 +++++++++++++++++
 utils/update_dataset_tables.py | 35 +++++++++++++
 5 files changed, 82 insertions(+), 136 deletions(-)
 delete mode 100644 meta_table_for_readme.py
 delete mode 100644 script.py
 create mode 100644 utils/config.py
 create mode 100644 utils/fetch_dataset.py
 create mode 100644 utils/update_dataset_tables.py

diff --git a/meta_table_for_readme.py b/meta_table_for_readme.py
deleted file mode 100644
index 1de1695b..00000000
--- a/meta_table_for_readme.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import glob
-import json
-
-from cfpq_data.config import DATASET
-
-
-def rdf_dict():
-    data_rdf = dict()
-    names_rdf = DATASET["RDF"]
-    for name in names_rdf:
-        with open(f"./cfpq_data/data/RDF/Graphs/{name}_meta.json", "r") as graph_info:
-            data_rdf[name] = json.load(graph_info)
-    return data_rdf
-
-
-def memoryaliases_dict():
-    data_memoryaliases = dict()
-    names_memoryaliases = DATASET["MemoryAliases"]
-    for name in names_memoryaliases:
-        name_of_file = glob.glob("./cfpq_data/data/MemoryAliases/Graphs/" + name + "*")[
-            0
-        ]
-        with open(name_of_file, "r") as graph_info:
-            data_memoryaliases[name] = json.load(graph_info)
-    return data_memoryaliases
-
-
-def create_table():
-    column_names = ["name", "vertices", "edges", "size of file"]
-    table_rdf = (
-        "| Name | Vertices | Edges | Size of file (Bytes) |\n"
-        + "|:---|:---|:---|:---|\n"
-    )
-
-    table_memoryaliases = (
-        "| Name | Vertices | Edges | Size of file (Bytes) |\n"
-        + "|:---|:---|:---|:---|\n"
-    )
-
-    rdf = rdf_dict()
-    memory_aliases = memoryaliases_dict()
-    names_rdf = DATASET["RDF"]
-    names_memoryaliases = DATASET["MemoryAliases"]
-
-    for name in sorted(names_rdf, key=lambda x: int(rdf[x]["size of file"])):
-        for column_name in column_names:
-            table_rdf += "| " + str(rdf[name][column_name]) + " "
-        table_rdf += "|\n"
-    for name in sorted(
-        names_memoryaliases, key=lambda x: int(memory_aliases[x]["size of file"])
-    ):
-        for column_name in column_names:
-            table_memoryaliases += "| " + str(memory_aliases[name][column_name]) + " "
-        table_memoryaliases += "|\n"
-
-    with open("./docs/README.md", "rt") as input_file:
-        lines = input_file.readlines()
-    with open("./docs/README.md", "wt") as output_file:
-        for line in lines:
-            if "#### RDF" in line:
-                output_file.write(line)
-                output_file.write(table_rdf)
-                output_file.write("\n")
-                continue
-            elif "#### MemoryAliases" in line:
-                output_file.write(line)
-                output_file.write(table_memoryaliases)
-                output_file.write("\n")
-                continue
-            output_file.write(line)
-
-
-def clean_table():
-    flag = 0
-    with open("./docs/README.md", "rt") as input_file:
-        lines = input_file.readlines()
-    with open("./docs/README.md", "wt") as output_file:
-        for line in lines:
-            if "#### RDF" in line or "#### MemoryAliases" in line:
-                output_file.write(line)
-                flag = 1
-            if "### Reference values" in line:
-                flag = 0
-            if flag == 0:
-                output_file.write(line)
-
-
-if __name__ == "__main__":
-    clean_table()
-    create_table()
diff --git a/script.py b/script.py
deleted file mode 100644
index b6b65851..00000000
--- a/script.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import boto3
-import json
-
-
-ACCESS_KEY = "AKIA326NG25W2XT6TBAZ"
-SECRET_KEY = "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"
-
-
-def get_dataset(access_key, secret_key):
-    s3 = boto3.client('s3',
-                      aws_access_key_id=access_key,
-                      aws_secret_access_key=secret_key)
-    keys = []
-    file_name_old = str()
-    dataset = dict()
-    answer = dict()
-
-    for key in s3.list_objects(Bucket='cfpq-data')['Contents']:
-        keys.append(key['Key'])
-
-    for key in keys:
-        file_name, graph_name = key.split('/')
-        if file_name != file_name_old and file_name_old != "":
-            answer[file_name_old] = dataset
-            dataset = {}
-        spleeeet = graph_name.split('.')
-        arch_ext, file_ext = spleeeet[-2] + '.' + spleeeet[-1], spleeeet[-3]
-        for i in range(3):
-            spleeeet.pop(-1)
-        if spleeeet[-1] == 'txt':
-            spleeeet.pop(-1)
-        graph_name = ".".join(spleeeet)
-        tmp = dict()
-        tmp['version_id'] = s3.head_object(Bucket="cfpq-data", Key=key)['VersionId']
-        tmp['file_extension'] = "." + file_ext
-        tmp['archive_extension'] = "." + arch_ext
-        dataset[graph_name] = tmp
-        file_name_old = file_name
-
-    answer[file_name_old] = dataset
-    return answer
-
-
-if __name__ == "__main__":
-    with open('config.json', 'w') as file:
-        json.dump(get_dataset(ACCESS_KEY, SECRET_KEY), file, indent=4)
diff --git a/utils/config.py b/utils/config.py
new file mode 100644
index 00000000..99b197c2
--- /dev/null
+++ b/utils/config.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+MAIN_FOLDER = Path(__file__).parent.parent
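The new utils/config.py assumes the utils/ directory sits directly under the repository root, so Path(__file__).parent.parent resolves to the project folder no matter where the scripts are run from. A minimal sketch of that resolution, using a made-up checkout path; only the two trailing components (utils/config.py) come from the patch itself:

    from pathlib import Path

    # Hypothetical location of utils/config.py, for illustration only.
    here = Path("/home/user/CFPQ_Data/utils/config.py")

    print(here.parent)         # /home/user/CFPQ_Data/utils
    print(here.parent.parent)  # /home/user/CFPQ_Data  (what MAIN_FOLDER points to)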
diff --git a/utils/fetch_dataset.py b/utils/fetch_dataset.py
new file mode 100644
index 00000000..7743c1a4
--- /dev/null
+++ b/utils/fetch_dataset.py
@@ -0,0 +1,44 @@
+from collections import defaultdict
+from json import dumps
+
+from boto3 import client
+
+from cfpq_data import __version__ as cfpq_data_version
+from cfpq_data.config import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME
+from config import MAIN_FOLDER
+
+
+def fetch_dataset():
+    s3 = client(
+        "s3",
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+
+    dataset = defaultdict(dict)
+
+    for graph in s3.list_objects(Bucket=BUCKET_NAME, Prefix=cfpq_data_version)[
+        "Contents"
+    ]:
+        graph_key = graph["Key"]
+        graph_class, graph_full_name = graph_key.split("/")[1:]
+        graph_name = graph_full_name.split(".")[0]
+        graph_file_extension = "." + graph_full_name.split(".")[1]
+        graph_archive_extension = graph_full_name.split(graph_file_extension)[1]
+        dataset[graph_class][graph_name] = {
+            "VersionId": s3.head_object(Bucket=BUCKET_NAME, Key=graph_key)["VersionId"],
+            "FileExtension": graph_file_extension,
+            "ArchiveExtension": graph_archive_extension,
+        }
+
+    return dataset
+
+
+def update_dataset(dataset):
+    with open(MAIN_FOLDER / "cfpq_data" / "dataset.py", "w") as fout:
+        fout.write("dataset = " + dumps(dataset, indent=4))
+
+
+if __name__ == "__main__":
+    dataset = fetch_dataset()
+    update_dataset(dataset)
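fetch_dataset() expects object keys of the form <cfpq_data_version>/<graph_class>/<graph_name>.<file_ext>.<archive_ext>, matching the ".txt.tar.gz"-style names the deleted script.py handled. A worked sketch of the parsing above on a single hypothetical key; the "1.0.0" prefix and "skos" name are assumptions for illustration, not real bucket contents:

    graph_key = "1.0.0/RDF/skos.txt.tar.gz"  # hypothetical S3 object key

    graph_class, graph_full_name = graph_key.split("/")[1:]  # "RDF", "skos.txt.tar.gz"
    graph_name = graph_full_name.split(".")[0]  # "skos"
    graph_file_extension = "." + graph_full_name.split(".")[1]  # ".txt"
    graph_archive_extension = graph_full_name.split(graph_file_extension)[1]  # ".tar.gz"

One caveat: a graph whose name itself contains a dot would be truncated by the split(".")[0] step, so the scheme relies on dot-free graph names.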
diff --git a/utils/update_dataset_tables.py b/utils/update_dataset_tables.py
new file mode 100644
index 00000000..0c5b3002
--- /dev/null
+++ b/utils/update_dataset_tables.py
@@ -0,0 +1,35 @@
+from csv import DictWriter
+
+from cfpq_data import graph_from_dataset
+from config import MAIN_FOLDER
+from fetch_dataset import fetch_dataset
+
+
+def update_dataset_tables(dataset):
+    for graph_class in dataset.keys():
+        fieldnames = ["Graph", "#Vertices", "#Edges"]
+        with open(
+            MAIN_FOLDER / "docs" / "dataset" / f"{graph_class}.csv", mode="w"
+        ) as csv_file:
+            csv_writer = DictWriter(csv_file, fieldnames=fieldnames)
+            csv_writer.writeheader()
+
+            for graph_name in dataset[graph_class]:
+                graph = graph_from_dataset(graph_name)
+                csv_writer.writerow(
+                    dict(
+                        zip(
+                            fieldnames,
+                            [
+                                graph_name,
+                                f"{graph.number_of_nodes():,}",
+                                f"{graph.number_of_edges():,}",
+                            ],
+                        )
+                    )
+                )
+
+
+if __name__ == "__main__":
+    dataset = fetch_dataset()
+    update_dataset_tables(dataset)
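Both new scripts are standalone entry points: running fetch_dataset.py rewrites cfpq_data/dataset.py from the bucket listing, while update_dataset_tables.py fetches the same listing itself and regenerates docs/dataset/<graph_class>.csv. A sketch of the dataset.py module that update_dataset() emits, with placeholder values rather than real bucket contents:

    dataset = {
        "RDF": {
            "skos": {
                "VersionId": "<s3-version-id>",
                "FileExtension": ".txt",
                "ArchiveExtension": ".tar.gz"
            }
        }
    }

Since dumps(..., indent=4) produces JSON with double-quoted strings, the emitted literal is also valid Python, which is what lets cfpq_data import the table directly.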