From e18e9985a9b987e1e328109e58a494d65c31ccb0 Mon Sep 17 00:00:00 2001
From: Alessandro Lorenzi
Date: Thu, 4 May 2023 12:17:51 +0200
Subject: [PATCH] fix: NA county_code is valid, not Nan

refs: https://github.com/symerio/pgeocode/issues/73
---
 pgeocode.py      | 36 +++++++++++++++++++++++++++++++++---
 test_pgeocode.py |  8 ++++++++
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/pgeocode.py b/pgeocode.py
index 69f0911..85e3427 100644
--- a/pgeocode.py
+++ b/pgeocode.py
@@ -141,6 +141,27 @@
     "ZA",
 ]
 
+NA_VALUES = [
+    "",
+    "#N/A",
+    "#N/A N/A",
+    "#NA",
+    "-1.#IND",
+    "-1.#QNAN",
+    "-NaN",
+    "-nan",
+    "1.#IND",
+    "1.#QNAN",
+    "<NA>",
+    "N/A",
+    # "NA", # NA is a valid county code for Naples, Italy
+    "NULL",
+    "NaN",
+    "n/a",
+    "nan",
+    "null",
+]
+
 
 @contextlib.contextmanager
 def _open_extract_url(url: str, country: str) -> Any:
@@ -231,7 +252,12 @@ def _get_data(country: str) -> Tuple[str, pd.DataFrame]:
 
         data_path = os.path.join(STORAGE_DIR, country.upper() + ".txt")
         if os.path.exists(data_path):
-            data = pd.read_csv(data_path, dtype={"postal_code": str})
+            data = pd.read_csv(
+                data_path,
+                dtype={"postal_code": str},
+                na_values=NA_VALUES,
+                keep_default_na=False,
+            )
         else:
             download_urls = [
                 val.format(country=country) for val in DOWNLOAD_URL
@@ -243,10 +269,11 @@ def _get_data(country: str) -> Tuple[str, pd.DataFrame]:
                     header=None,
                     names=DATA_FIELDS,
                     dtype={"postal_code": str},
+                    na_values=NA_VALUES,
+                    keep_default_na=False,
                 )
             os.makedirs(STORAGE_DIR, exist_ok=True)
             data.to_csv(data_path, index=None)
-
         return data_path, data
 
     def _index_postal_codes(self) -> pd.DataFrame:
@@ -255,7 +282,10 @@ def _index_postal_codes(self) -> pd.DataFrame:
 
         if os.path.exists(data_path_unique):
             data_unique = pd.read_csv(
-                data_path_unique, dtype={"postal_code": str}
+                data_path_unique,
+                dtype={"postal_code": str},
+                na_values=NA_VALUES,
+                keep_default_na=False,
             )
         else:
             # group together places with the same postal code
diff --git a/test_pgeocode.py b/test_pgeocode.py
index 7ba37bb..3fa3c12 100644
--- a/test_pgeocode.py
+++ b/test_pgeocode.py
@@ -278,6 +278,14 @@ def test_query_location_exact():
     assert res["state_name"].unique().tolist() == ["Île-de-France"]
 
 
+def test_location_naples():
+    # https://github.com/symerio/pgeocode/issues/73
+    nomi = Nominatim("it")
+    res = nomi.query_location("Napoli")
+    assert res["county_name"].unique().tolist() == ["Napoli"]
+    assert res["county_code"].unique().tolist() == ["NA"]
+
+
 def test_query_location_fuzzy():
     pytest.importorskip("thefuzz")
     nomi = Nominatim("fr")