From 7979dd044b1dda2e4cfdf29cfed298435e5b374f Mon Sep 17 00:00:00 2001 From: vdshk Date: Sat, 8 May 2021 19:52:26 +0300 Subject: [PATCH] Migrate graphs to AWS S3 --- .gitignore | 7 + cfpq_data/__init__.py | 5 +- cfpq_data/config.py | 217 ++---------------------------- cfpq_data/dataset.py | 206 ++++++++++++++++++++++++++++ cfpq_data/graphs/readwrite/rdf.py | 105 ++++++++++----- 5 files changed, 298 insertions(+), 242 deletions(-) create mode 100644 cfpq_data/dataset.py diff --git a/.gitignore b/.gitignore index 86794fc5..07d3255a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # IDEA .idea +# Data +cfpq_data/data + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -50,6 +53,10 @@ coverage.xml .hypothesis/ .pytest_cache/ +# Doctest files +test.txt +test.xml + # Translations *.mo *.pot diff --git a/cfpq_data/__init__.py b/cfpq_data/__init__.py index f77b8973..6e1ba5f2 100644 --- a/cfpq_data/__init__.py +++ b/cfpq_data/__init__.py @@ -7,7 +7,10 @@ experimental analysis of context-free path querying algorithms """ -__version__ = "0.0.0" +__version__ = "1.0.0" + +import cfpq_data.config +from cfpq_data.config import * import cfpq_data.graphs from cfpq_data.graphs import * diff --git a/cfpq_data/config.py b/cfpq_data/config.py index 22db51f6..a7c75a6a 100644 --- a/cfpq_data/config.py +++ b/cfpq_data/config.py @@ -1,210 +1,15 @@ from pathlib import Path +__all__ = [ + "MAIN_FOLDER", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "BUCKET_NAME", +] + MAIN_FOLDER = Path(__file__).parent -DATASET = { - "MemoryAliases": { - "Apache_httpd_2.2.18_pointsto_graph": { - "version_id": "6CfRViYHGLyYvSiKf8kWhwOQpFUvUhrw", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "PostgreSQL_8.3.9_pointsto_graph": { - "version_id": "dlVensEsC3.Tjn3MZipEgfCXTyFc9Gno", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "arch_afterInline": { - "version_id": "oYn672Yd85LFkdAVsW3ppJDAXcvTWEsq", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "block_afterInline": { - "version_id": "yjSfyx3ND7LiqlTZcyk3WQDGO.bdmTed", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "bzip2": { - "version_id": "_V9sUGA3HzqgF22dH34QjMq5wbO48oVL", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "crypto_afterInline": { - "version_id": "ikRlpHT_E5e8iutdRRwyiadm6Y3LB2DS", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "drivers_afterInline": { - "version_id": "Wwlv0L6vKWvPYttwLNWfGESPN6wPCnP.", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "fs_afterInline": { - "version_id": "MkdtntPJDg2zH5EYLPvgbZe5y.x.FAyv", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "gzip": { - "version_id": "Qq7CIGhL6TZvfsOhT2ou64mR9HoxwZjz", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "init_afterInline": { - "version_id": "Qb48SAVUqCy3PnQ5tPGesC4jXtw93kpy", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "ipc_afterInline": { - "version_id": "Hg3IDlRfPYHdXPBfdSib.N7TwPYl9n1S", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "kernel_afterInline": { - "version_id": "hDJdX1VgrfoEzJzQCIcCfa4aTRsttVHN", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "lib_afterInline": { - "version_id": "eOLlCHTuaqG952PnkV1UBEhc0ZInJ7C7", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "ls": { - "version_id": "joxt5Ofql.T9BGD9DzGamhY4w1Dp3Of2", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "mm_afterInline": { - "version_id": "h3zNK21CY54316eQcay6fgsp5QJCnOgs", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "net_afterInline": { - "version_id": "4RT6k1W58sd6GciF2wlk4kDaCD3RPBJ6", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "pr": { - "version_id": "7JVaerYb4Xr6Pee4qU8qJWVKyUlJ5rVr", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "security_afterInline": { - "version_id": "ptv8rwCAPdHuo3ABJpvnr92.pQPTU.6C", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "sound_afterInline": { - "version_id": "C2b1j6J73_8tx6pZNVLtTDyhq8bAW5Zn", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - }, - "wc": { - "version_id": "wvpVN9XAqfMvkAO44YYXC5L4vKP3lJLM", - "file_extension": ".xml", - "archive_extension": ".tar.gz" - } - }, - "RDF": { - "atom-primitive": { - "version_id": "d.KnhyNlOpor7YAnu6MCOefbTQmDAWyy", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "biomedical-mesure-primitive": { - "version_id": "gz33IPh2tGDGI27Z3mIdBBdNYKyOg_.Y", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "core": { - "version_id": "Uztj4uXtJMtW92Tpi3bnBQ.GR_qBSmlp", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "eclass_514en": { - "version_id": "JMWYihKJ463hxx_c6AXT4RjDTk0HlXlD", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "enzyme": { - "version_id": "dD7wbg9FtbuxYQLlmVsnEWGSFSdXhegZ", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "foaf": { - "version_id": "Z6I8ecv_QvvEQRM8V4zOVODq1n_UW8AR", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "funding": { - "version_id": "gdYtwOO4UfJYNjkjVffwneIe2NHSlpDK", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "generations": { - "version_id": "WYijOVKMwCxlJhGO1_7KPYbvumMgABTv", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "geospecies": { - "version_id": "lw9qej02UyFuHkcBmQ_7RIXDajisTK1P", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "go-hierarchy": { - "version_id": "8Sx964vDBw8plJh8uVR028MD.Hm8SyfQ", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "go": { - "version_id": "g3htMbp43uGbexRrbBO4mzHGeYVDBkR3", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "pathways": { - "version_id": "ZpMYFCBHEftavyxjFWOFnvoLkj6wblcJ", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "people_pets": { - "version_id": "NIbwB0y9iGpKQ.eXVZ6qLqvTgon3Czps", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "pizza": { - "version_id": "ARc81JDYzyL0MuO_9f74gn6VjfNZh9FF", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "skos": { - "version_id": "sYzQ5wxgKcGOw5MqXPqojJBwJKRwOKcr", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "taxonomy-hierarchy": { - "version_id": "UW4M3aO4mx72w_Yq0g6106wpfjGVU9ci", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "taxonomy": { - "version_id": "i_ZqYKddbnqtHkk2HD.CX1Ip8YMtgFhQ", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - }, - "travel": { - "version_id": "k7O7zm3_Mj4ZT3BjOHbrPiKm3Yjj0HOB", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "univ-bench": { - "version_id": "bt6cu6_XOImhe5wgSXeNFJIZv3il7.ry", - "file_extension": ".owl", - "archive_extension": ".tar.gz" - }, - "wine": { - "version_id": "e.1EQN4f_Z1lN8idPAaUnpxO68_AMbTR", - "file_extension": ".rdf", - "archive_extension": ".tar.gz" - } - } -} +AWS_ACCESS_KEY_ID = "AKIA326NG25W2XT6TBAZ" +AWS_SECRET_ACCESS_KEY = "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" + +BUCKET_NAME = "cfpq-data" diff --git a/cfpq_data/dataset.py b/cfpq_data/dataset.py new file mode 100644 index 00000000..9c35a60f --- /dev/null +++ b/cfpq_data/dataset.py @@ -0,0 +1,206 @@ +dataset = { + "MemoryAliases": { + "apache_httpd_2_2_18_points_to_graph": { + "VersionId": "oadvuyls8oTIWOq7wzd8KB2Qg1nxNNwB", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "arch_after_inline": { + "VersionId": "nxlkF9irtr7ZKhMC1ckC_Rchh8L6uNDK", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "block_after_inline": { + "VersionId": "iKk0kH6fdHyORZ_SfherVaoYPmc_K1VR", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "bzip2": { + "VersionId": "mZhHC39FGfteYYXrKPLeAT4d3m_CBtxM", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "crypto_after_inline": { + "VersionId": "9BRcFw114dLzqfGddfLqxEEhq.6rZWst", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "drivers_after_inline": { + "VersionId": "bm9U88PTDUrGOllB.CiHsEuoNTQ3zKZK", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "fs_after_inline": { + "VersionId": "1_bPDmotwN0brEiLFWNeX1daJvAcD0mz", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "gzip": { + "VersionId": "zL4SjMI5Ib5bOAjDIQVd47dUMMDndQ9N", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "init_after_inline": { + "VersionId": "gkFMwF0MVuSZkd8MmVCegDn.mI0CPb1_", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "ipc_after_inline": { + "VersionId": "lhZqCmCRJdyLHpF1wQp4_lR1rvk1FuB3", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "kernel_after_inline": { + "VersionId": "eqlI6ESw9ewbycaBOZ1mGi5pz1xf9xl2", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "lib_after_inline": { + "VersionId": "c.xojy035QPIqiyLqX5TByqXxLiEKa.E", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "ls": { + "VersionId": "EBfeenQofRUm_I_xBZpFwvIgoYamMRVq", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "mm_after_inline": { + "VersionId": "1ncZn5Vh.s9YNVXTmzV1E1adwWJn.7TO", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "net_after_inline": { + "VersionId": "8qR2sAhFefh9o_76RR_CgwfypYaqv1ju", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "postgre_sql_8_3_9_points_to_graph": { + "VersionId": "JuhDvkYmHEQqbjFgkuGQyZ9rohOJHieY", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "pr": { + "VersionId": "5p_ypate_ISXT9rqoJnNnwy303rWGBNy", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "security_after_inline": { + "VersionId": "y8JqKEnpFDgQTJDTsHiSr4Y1aFB9ivv9", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "sound_after_inline": { + "VersionId": "0R73D6WIkeGfDUJlDFwvIKy7FA1qCKUx", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + "wc": { + "VersionId": "acCYB7cmAZZN.KnzCMNmYftlXmE17JAt", + "FileExtension": ".xml", + "ArchiveExtension": ".tar.gz", + }, + }, + "RDF": { + "atom_primitive": { + "VersionId": "Uc2bI_8pILKwnxsFMzVfyozYPUPmVLAK", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "biomedical_mesure_primitive": { + "VersionId": "sQV0TqhOEgJB5IEaW_bXcLZqvpA..OgH", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "core": { + "VersionId": "LheS.Fjug8P3pw34G7i2lkwuczLXNf_H", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "eclass_514en": { + "VersionId": "JiWcg7kQM8u_6uyJIifMcIcb24fEHyW1", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "enzyme": { + "VersionId": "azuj7Guix00W2Z9J1jMUMZ0b4fdQBVAp", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "foaf": { + "VersionId": "s3cjk6QG0EZOUTwthCO2uW8KM1L93JoL", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "funding": { + "VersionId": "sJudEAnrvx02mG5WvICUFY251zHKuWr3", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "generations": { + "VersionId": "Ty_YXmWZUTKkwCDGlHUTH_04m4Z8ai.m", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "geospecies": { + "VersionId": "Il9rK9c2nmED0mU43zJRRnkP9ZsWoj6J", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "go": { + "VersionId": "KgBCTx.2aE3Z83UIaZfJsNETSOd3AnW4", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "go_hierarchy": { + "VersionId": "bfzjAo0hIiKl3P.5j0Di6qnZ91Crk2oM", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "pathways": { + "VersionId": "7yKgDvVNzNRxm1kShWj3k0gPLVfDKUoY", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "people_pets": { + "VersionId": "TQiYCf4FTKNXT1oe9i4z6EF8jrgOhZU6", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "pizza": { + "VersionId": "LI_T9sF_SfPXoHlqxpEgtm8nM46Ks.oQ", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "skos": { + "VersionId": "jdL0akCVzz6G1f9IHOq4BNmjELqumQbY", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "taxonomy": { + "VersionId": "ANVa_sG_P5f12mF4cJMjYouckY1FP5X5", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "taxonomy_hierarchy": { + "VersionId": "bg6Q8GujqnT8u68.puOZkVpccPA_otGT", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + "travel": { + "VersionId": "TeqOLYscWX5xiOxAm9meDO0euCMn1RX0", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "univ_bench": { + "VersionId": "FIZZTKzl2TgowE8clE6GK2HB0LDGQ3UA", + "FileExtension": ".owl", + "ArchiveExtension": ".tar.gz", + }, + "wine": { + "VersionId": "Cf2lrkO4v7o4YYTOaCuXLODXc3caYKez", + "FileExtension": ".rdf", + "ArchiveExtension": ".tar.gz", + }, + }, +} diff --git a/cfpq_data/graphs/readwrite/rdf.py b/cfpq_data/graphs/readwrite/rdf.py index 3660132b..d1c09961 100644 --- a/cfpq_data/graphs/readwrite/rdf.py +++ b/cfpq_data/graphs/readwrite/rdf.py @@ -1,15 +1,23 @@ """Read (and write) a graph from (and to) RDF file. """ +from os import path, remove from pathlib import Path +from shutil import unpack_archive from typing import Union +from boto3 import client from networkx import MultiDiGraph from rdflib import Graph as RDFGraph, BNode, URIRef, Literal, XSD -from cfpq_data.config import DATASET -from cfpq_data.utils import download_data -from cfpq_data.utils import unpack_graph +from cfpq_data import __version__ as VERSION +from cfpq_data.config import ( + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + MAIN_FOLDER, + BUCKET_NAME, +) +from cfpq_data.dataset import dataset as DATASET __all__ = [ "graph_from_dataset", @@ -18,6 +26,64 @@ ] +def graph_from_dataset(graph_name: str) -> MultiDiGraph: + """Returns a graph from + an RDF file loaded from + a dataset by name. + + Parameters + ---------- + graph_name : str + The name of the graph from the dataset. + + Examples + -------- + >>> import cfpq_data + >>> generations = cfpq_data.graph_from_dataset("generations") + >>> generations.number_of_nodes() + 129 + >>> generations.number_of_edges() + 273 + + Returns + ------- + g : MultiDiGraph + Loaded graph. + """ + for graph_class in DATASET.keys(): + if graph_name in DATASET[graph_class].keys(): + dst = MAIN_FOLDER / "data" / graph_class / "Graphs" + dst.mkdir(parents=True, exist_ok=True) + graph_file = graph_name + DATASET[graph_class][graph_name]["FileExtension"] + graph_file_path = str(dst / graph_file) + + if not path.isfile(graph_file_path): + graph_archive = ( + graph_file + DATASET[graph_class][graph_name]["ArchiveExtension"] + ) + graph_archive_path = str(dst / graph_archive) + + s3 = client( + "s3", + aws_access_key_id=AWS_ACCESS_KEY_ID, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + ) + s3.download_file( + Bucket=BUCKET_NAME, + Key=f"{VERSION}/{graph_class}/{graph_archive}", + Filename=graph_archive_path, + ExtraArgs={ + "VersionId": DATASET[graph_class][graph_name]["VersionId"], + }, + ) + + unpack_archive(graph_archive_path, dst) + + remove(graph_archive_path) + + return graph_from_rdf(graph_file_path) + + def graph_from_rdf(source: Union[Path, str]) -> MultiDiGraph: """Returns a graph from RDF file. @@ -53,37 +119,6 @@ def graph_from_rdf(source: Union[Path, str]) -> MultiDiGraph: return g -def graph_from_dataset(source: str) -> MultiDiGraph: - """Returns a graph from - an RDF file loaded from - a dataset by name. - - Parameters - ---------- - source : str - The name of the graph from the dataset. - - Examples - -------- - >>> import cfpq_data - >>> generations = cfpq_data.graph_from_dataset("generations") - >>> generations.number_of_nodes() - 129 - >>> generations.number_of_edges() - 273 - - Returns - ------- - g : MultiDiGraph - Loaded graph. - """ - for cls_name in DATASET.keys(): - if source in DATASET[cls_name].keys(): - download_data(cls_name, source, DATASET[cls_name][source]) - path_to_rdf = unpack_graph(cls_name, source) - return graph_from_rdf(path_to_rdf) - - def graph_to_rdf(graph: MultiDiGraph, path: Union[Path, str]) -> Path: """Returns the path to the RDF file where the graph will be saved. @@ -99,7 +134,7 @@ def graph_to_rdf(graph: MultiDiGraph, path: Union[Path, str]) -> Path: Examples -------- >>> import cfpq_data - >>> g = cfpq_data.graph_from_dataset("univ-bench") + >>> g = cfpq_data.graph_from_dataset("generations") >>> path = cfpq_data.graph_to_rdf(g, "test.xml") Returns