diff --git a/.circleci/config.yml b/.circleci/config.yml
index 463667446ed42..4acc6473e6add 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -141,11 +141,9 @@ workflows:
                 only: /^v.*/
           matrix:
             parameters:
-              cibw-build: ["cp39-manylinux_aarch64",
-                           "cp310-manylinux_aarch64",
+              cibw-build: ["cp310-manylinux_aarch64",
                            "cp311-manylinux_aarch64",
                            "cp312-manylinux_aarch64",
-                           "cp39-musllinux_aarch64",
                            "cp310-musllinux_aarch64",
                            "cp311-musllinux_aarch64",
                            "cp312-musllinux_aarch64",]
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index 63f687324b0ae..460ae2f8594c0 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -4,12 +4,6 @@ inputs:
   editable:
     description: Whether to build pandas in editable mode (default true)
     default: true
-  meson_args:
-    description: Extra flags to pass to meson
-    required: false
-  cflags_adds:
-    description: Items to append to the CFLAGS variable
-    required: false
 runs:
   using: composite
   steps:
@@ -30,12 +24,11 @@ runs:
     - name: Build Pandas
       run: |
-        export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}"
         if [[ ${{ inputs.editable }} == "true" ]]; then
-          pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \
+          pip install -e . --no-build-isolation -v --no-deps \
             --config-settings=setup-args="--werror"
         else
-          pip install . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \
+          pip install . --no-build-isolation -v --no-deps \
             --config-settings=setup-args="--werror"
         fi
       shell: bash -el {0}
diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml
index 66e4142dc0cbb..f5d6abdf0f186 100644
--- a/.github/actions/run-tests/action.yml
+++ b/.github/actions/run-tests/action.yml
@@ -7,14 +7,14 @@ runs:
     shell: bash -el {0}

   - name: Publish test results
-    uses: actions/upload-artifact@v3
+    uses: actions/upload-artifact@v4
     with:
       name: Test results
       path: test-data.xml
     if: failure()

   - name: Upload coverage to Codecov
-    uses: codecov/codecov-action@v3
+    uses: codecov/codecov-action@v4
    with:
      flags: unittests
      name: codecov-pandas
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index 2de1649d42dfd..97f90c1588962 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -53,7 +53,7 @@ jobs:
     runs-on: ubuntu-22.04
     strategy:
       matrix:
-        python-version: ['3.9', '3.10', '3.11']
+        python-version: ['3.10', '3.11']
       fail-fast: false
     name: Test Conda Forge Recipe - Python ${{ matrix.python-version }}
     concurrency:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index aaa6153c04615..ddb6ecbe83126 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -26,7 +26,7 @@ jobs:
     timeout-minutes: 90
     strategy:
       matrix:
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
         include:
@@ -35,7 +35,7 @@ jobs:
             pattern: "not slow and not network and not single_cpu"
             pytest_target: "pandas/tests/test_downstream.py"
           - name: "Minimum Versions"
-            env_file: actions-39-minimum_versions.yaml
+            env_file: actions-310-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
           - name: "Locale: it_IT"
             env_file: actions-311.yaml
@@ -74,9 +74,9 @@ jobs:
       PATTERN: ${{ matrix.pattern }}
       LANG: ${{ matrix.lang || 'C.UTF-8' }}
       LC_ALL: ${{ matrix.lc_all || '' }}
-      PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
+      PANDAS_CI: '1'
       TEST_ARGS: ${{ matrix.test_args || '' }}
-      PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }}
+      PYTEST_WORKERS: 'auto'
       PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
       NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
       # Clipboard tests
@@ -88,7 +88,7 @@ jobs:
     services:
       mysql:
-        image: mysql:8.0.33
+        image: mysql:8
         env:
           MYSQL_ALLOW_EMPTY_PASSWORD: yes
           MYSQL_DATABASE: pandas
@@ -101,7 +101,7 @@ jobs:
           - 3306:3306

       postgres:
-        image: postgres:13
+        image: postgres:16
         env:
           PGUSER: postgres
           POSTGRES_USER: postgres
@@ -116,7 +116,7 @@ jobs:
           - 5432:5432

       moto:
-        image: motoserver/moto:4.1.13
+        image: motoserver/moto:5.0.0
         env:
           AWS_ACCESS_KEY_ID: foobar_key
           AWS_SECRET_ACCESS_KEY: foobar_secret
@@ -146,9 +146,8 @@ jobs:
     - name: Build Pandas
       id: build
       uses: ./.github/actions/build_pandas
-      with:
-        meson_args: ${{ matrix.meson_args }}
-        cflags_adds: ${{ matrix.cflags_adds }}
+      # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge
+      if: ${{ matrix.name != 'Pypy' }}

     - name: Test (not single_cpu)
       uses: ./.github/actions/run-tests
@@ -170,7 +169,7 @@ jobs:
     strategy:
       matrix:
         # Note: Don't use macOS latest since macos 14 appears to be arm64 only
         os: [macos-13, macos-14, windows-latest]
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
       fail-fast: false
     runs-on: ${{ matrix.os }}
     name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -300,7 +299,7 @@ jobs:
   # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
   # to the corresponding posix/windows-macos/sdist etc. workflows.
   # Feel free to modify this comment as necessary.
-  if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+  # if: false # Uncomment this to freeze the workflow, comment it to unfreeze
   defaults:
     run:
       shell: bash -eou pipefail {0}
@@ -315,7 +314,7 @@ jobs:

   concurrency:
     # https://github.community/t/concurrecy-not-work-for-push/183068/7
-    group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev
+    group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-dev
     cancel-in-progress: true

   env:
@@ -332,7 +331,7 @@ jobs:
     - name: Set up Python Dev Version
       uses: actions/setup-python@v5
       with:
-        python-version: '3.12-dev'
+        python-version: '3.13-dev'

     - name: Build Environment
       run: |
@@ -347,6 +346,51 @@ jobs:
     - name: Run Tests
       uses: ./.github/actions/run-tests

+  python-freethreading:
+    defaults:
+      run:
+        shell: bash -eou pipefail {0}
+    runs-on: ubuntu-22.04
+
+    timeout-minutes: 90
+
+    concurrency:
+      # https://github.community/t/concurrecy-not-work-for-push/183068/7
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev
+      cancel-in-progress: true
+
+    env:
+      PYTEST_WORKERS: "auto"
+      PANDAS_CI: 1
+      PATTERN: "not slow and not network and not clipboard and not single_cpu"
+      PYTEST_TARGET: pandas
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python Free-threading Version
+        uses: deadsnakes/action@v3.1.0
+        with:
+          python-version: 3.13-dev
+          nogil: true
+
+      - name: Build Environment
+        run: |
+          python --version
+          python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
+          python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython
+          python -m pip install versioneer[toml]
+          python -m pip install python-dateutil pytz tzdata hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov
+          python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
+          python -m pip list
+
+      - name: Run Tests
+        uses: ./.github/actions/run-tests
+        env:
+          PYTHON_GIL: 0
+
   # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml
   emscripten:
     # Note: the Python version, Emscripten toolchain version are determined
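Reviewer note (not part of the patch): the new python-freethreading job installs a nogil CPython 3.13 build via deadsnakes and exports PYTHON_GIL=0 for the test run. A minimal sketch of how a test session could confirm it is actually running without the GIL; `sysconfig`'s `Py_GIL_DISABLED` config var and the private `sys._is_gil_enabled()` helper exist only on CPython 3.13+, hence the guard:

```python
import sys
import sysconfig

# Py_GIL_DISABLED is 1 only for free-threaded ("nogil") builds of CPython.
freethreaded_build = bool(sysconfig.get_config_var("Py_GIL_DISABLED"))

# sys._is_gil_enabled() (3.13+) reports whether the GIL is active right now;
# with PYTHON_GIL=0 it should return False on a free-threaded build.
gil_enabled = getattr(sys, "_is_gil_enabled", lambda: True)()

print(f"free-threaded build: {freethreaded_build}, GIL enabled: {gil_enabled}")
```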
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 2403574d94681..260a2f3774fbb 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -99,7 +99,7 @@ jobs:
         - [macos-14, macosx_arm64]
         - [windows-2022, win_amd64]
         # TODO: support PyPy?
-        python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]]
+        python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]]
     env:
       IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
       IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -140,7 +140,7 @@ jobs:
       run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

     - name: Build wheels
-      uses: pypa/cibuildwheel@v2.18.1
+      uses: pypa/cibuildwheel@v2.19.2
       with:
         package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
       env:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bf88500b10524..b81b9ba070a44 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.7
+    rev: v0.5.0
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -67,15 +67,16 @@ repos:
     -   id: fix-encoding-pragma
         args: [--remove]
     -   id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
 -   repo: https://github.com/PyCQA/isort
     rev: 5.13.2
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.2
+    rev: v3.16.0
     hooks:
     -   id: pyupgrade
-        args: [--py39-plus]
+        args: [--py310-plus]
 -   repo: https://github.com/pre-commit/pygrep-hooks
     rev: v1.10.0
     hooks:
@@ -92,7 +93,7 @@ repos:
     -   id: sphinx-lint
         args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.5
+    rev: v18.1.8
     hooks:
     -   id: clang-format
         files: ^pandas/_libs/src|^pandas/_libs/include
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 15e691d46f693..b2495356f134c 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -546,24 +546,17 @@ def time_chained_indexing(self, mode):


 class Block:
-    params = [
-        (True, "True"),
-        (np.array(True), "np.array(True)"),
-    ]
-
-    def setup(self, true_value, mode):
+    def setup(self):
         self.df = DataFrame(
             False,
             columns=np.arange(500).astype(str),
             index=date_range("2010-01-01", "2011-01-01"),
         )
-        self.true_value = true_value
-
-    def time_test(self, true_value, mode):
+    def time_test(self):
         start = datetime(2010, 5, 1)
         end = datetime(2010, 9, 1)
-        self.df.loc[start:end, :] = true_value
+        self.df.loc[start:end, :] = True


 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py
index dcc73aefc6c7a..9d9689fcfa94b 100644
--- a/asv_bench/benchmarks/tslibs/timedelta.py
+++ b/asv_bench/benchmarks/tslibs/timedelta.py
@@ -20,7 +20,7 @@ def time_from_int(self):
         Timedelta(123456789)

     def time_from_unit(self):
-        Timedelta(1, unit="d")
+        Timedelta(1, unit="D")

     def time_from_components(self):
         Timedelta(
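Reviewer note (not part of the patch): the benchmark above switches from the lowercase `unit="d"` alias to the canonical `"D"`. A quick sketch, assuming pandas is installed, of equivalent constructions that sidestep single-letter unit aliases altogether:

```python
from pandas import Timedelta

# One day, using the canonical "D" unit alias the benchmark now exercises.
one_day = Timedelta(1, unit="D")

# Equivalent spellings that avoid unit aliases entirely.
assert one_day == Timedelta(days=1) == Timedelta("1 day")
print(one_day)  # 1 days 00:00:00
```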
diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py
index 082220ee0dff2..6145966fb6a0e 100644
--- a/asv_bench/benchmarks/tslibs/timestamp.py
+++ b/asv_bench/benchmarks/tslibs/timestamp.py
@@ -1,7 +1,10 @@
-from datetime import datetime
+from datetime import (
+    datetime,
+    timezone,
+)
+import zoneinfo

 import numpy as np
-import pytz

 from pandas import Timestamp

@@ -12,7 +15,7 @@ class TimestampConstruction:
     def setup(self):
         self.npdatetime64 = np.datetime64("2020-01-01 00:00:00")
         self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0)
-        self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC)
+        self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, timezone.utc)
         self.ts = Timestamp("2020-01-01 00:00:00")

     def time_parse_iso8601_no_tz(self):
@@ -113,7 +116,7 @@ def setup(self, tz):
         self.ts = Timestamp("2017-08-25 08:16:14", tz=tz)

     def time_replace_tz(self, tz):
-        self.ts.replace(tzinfo=pytz.timezone("US/Eastern"))
+        self.ts.replace(tzinfo=zoneinfo.ZoneInfo("US/Eastern"))

     def time_replace_None(self, tz):
         self.ts.replace(tzinfo=None)
@@ -144,8 +147,8 @@ def time_ceil(self, tz):

 class TimestampAcrossDst:
     def setup(self):
-        dt = datetime(2016, 3, 27, 1)
-        self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo
+        dt = datetime(2016, 3, 27, 1, fold=0)
+        self.tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo
         self.ts2 = Timestamp(dt)

     def time_replace_across_dst(self):
diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py
index 4a011d4bb3f06..885cf48d01743 100644
--- a/asv_bench/benchmarks/tslibs/tslib.py
+++ b/asv_bench/benchmarks/tslibs/tslib.py
@@ -20,13 +20,13 @@
     timedelta,
     timezone,
 )
+import zoneinfo

 from dateutil.tz import (
     gettz,
     tzlocal,
 )
 import numpy as np
-import pytz

 try:
     from pandas._libs.tslibs import ints_to_pydatetime
@@ -38,7 +38,7 @@
     None,
     timezone.utc,
     timezone(timedelta(minutes=60)),
-    pytz.timezone("US/Pacific"),
+    zoneinfo.ZoneInfo("US/Pacific"),
     gettz("Asia/Tokyo"),
     tzlocal_obj,
 ]
diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py
index c6b510efdca69..c87adb5e5d0e9 100644
--- a/asv_bench/benchmarks/tslibs/tz_convert.py
+++ b/asv_bench/benchmarks/tslibs/tz_convert.py
@@ -1,5 +1,6 @@
+from datetime import timezone
+
 import numpy as np
-from pytz import UTC

 from pandas._libs.tslibs.tzconversion import tz_localize_to_utc

@@ -41,7 +42,7 @@ def time_tz_convert_from_utc(self, size, tz):
         # dti = DatetimeIndex(self.i8data, tz=tz)
         # dti.tz_localize(None)
         if old_sig:
-            tz_convert_from_utc(self.i8data, UTC, tz)
+            tz_convert_from_utc(self.i8data, timezone.utc, tz)
         else:
             tz_convert_from_utc(self.i8data, tz)
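Reviewer note (not part of the patch): the three benchmark files above drop pytz in favor of the standard library. A minimal sketch of the replacement pattern behind the TimestampAcrossDst change: instead of pytz's `localize(..., is_dst=...)`, attach a zoneinfo zone directly and let `fold` disambiguate repeated wall times around a DST transition (assumes the system tz database, or the tzdata package, is available):

```python
from datetime import datetime
from zoneinfo import ZoneInfo

# pytz style (removed):  pytz.timezone("CET").localize(dt, is_dst=False)
# zoneinfo style: attach the zone directly; `fold` picks a side when a wall
# time occurs twice (irrelevant here, since 01:00 is unambiguous).
dt = datetime(2016, 3, 27, 1, fold=0, tzinfo=ZoneInfo("Europe/Berlin"))

print(dt.utcoffset())   # 1:00:00 -- CET, just before the spring-forward jump
print(dt.isoformat())   # 2016-03-27T01:00:00+01:00
```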
"$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.to_frame RT03" \ -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ - -i "pandas.NamedAgg SA01" \ -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ -i "pandas.Period.month SA01" \ - -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.Period.strftime PR01,SA01" \ -i "pandas.Period.to_timestamp SA01" \ @@ -156,7 +153,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.ge SA01" \ -i "pandas.Series.gt SA01" \ -i "pandas.Series.list.__getitem__ SA01" \ -i "pandas.Series.list.flatten SA01" \ @@ -165,7 +161,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.plot PR02" \ - -i "pandas.Series.pop RT03,SA01" \ + -i "pandas.Series.pop SA01" \ -i "pandas.Series.prod RT03" \ -i "pandas.Series.product RT03" \ -i "pandas.Series.reorder_levels RT03,SA01" \ @@ -212,11 +208,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.to_frame SA01" \ -i "pandas.Series.to_markdown SA01" \ -i "pandas.Series.update PR07,SA01" \ - -i "pandas.Timedelta.as_unit SA01" \ -i "pandas.Timedelta.asm8 SA01" \ -i "pandas.Timedelta.ceil SA01" \ -i "pandas.Timedelta.components SA01" \ - -i "pandas.Timedelta.days SA01" \ -i "pandas.Timedelta.floor SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ @@ -236,7 +230,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.ctime SA01" \ -i "pandas.Timestamp.date SA01" \ -i "pandas.Timestamp.day GL08" \ - -i "pandas.Timestamp.floor SA01" \ -i "pandas.Timestamp.fold GL08" \ -i "pandas.Timestamp.fromordinal SA01" \ -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ @@ -250,13 +243,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.month_name SA01" \ -i "pandas.Timestamp.nanosecond GL08" \ -i "pandas.Timestamp.normalize SA01" \ - -i "pandas.Timestamp.now SA01" \ -i "pandas.Timestamp.quarter SA01" \ -i "pandas.Timestamp.replace PR07,SA01" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.second GL08" \ -i "pandas.Timestamp.strptime PR01,SA01" \ - -i "pandas.Timestamp.time SA01" \ -i "pandas.Timestamp.timestamp SA01" \ -i "pandas.Timestamp.timetuple SA01" \ -i "pandas.Timestamp.timetz SA01" \ @@ -266,7 +257,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.to_period PR01,SA01" \ -i "pandas.Timestamp.today SA01" \ -i "pandas.Timestamp.toordinal SA01" \ - -i "pandas.Timestamp.tz SA01" \ -i "pandas.Timestamp.tz_localize SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.tzname SA01" \ @@ -465,8 +455,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.merge PR07" \ - -i "pandas.merge_asof PR07,RT03" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml similarity index 98% rename from ci/deps/actions-39-minimum_versions.yaml rename to ci/deps/actions-310-minimum_versions.yaml index b760f27a3d4d3..a9c205d24d212 100644 --- 
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
similarity index 98%
rename from ci/deps/actions-39-minimum_versions.yaml
rename to ci/deps/actions-310-minimum_versions.yaml
index b760f27a3d4d3..a9c205d24d212 100644
--- a/ci/deps/actions-39-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -4,7 +4,7 @@ name: pandas-dev
 channels:
   - conda-forge
 dependencies:
-  - python=3.9
+  - python=3.10

   # build dependencies
   - versioneer[toml]
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index d84063ac2a9ba..5455b9b84b034 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -18,7 +18,7 @@ dependencies:

   # required dependencies
   - python-dateutil
-  - numpy
+  - numpy<2
   - pytz
   - pip
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
deleted file mode 100644
index 8f235a836bb3d..0000000000000
--- a/ci/deps/actions-39.yaml
+++ /dev/null
@@ -1,63 +0,0 @@
-name: pandas-dev
-channels:
-  - conda-forge
-dependencies:
-  - python=3.9
-
-  # build dependencies
-  - versioneer[toml]
-  - cython>=0.29.33
-  - meson[ninja]=1.2.1
-  - meson-python=0.13.1
-
-  # test dependencies
-  - pytest>=7.3.2
-  - pytest-cov
-  - pytest-xdist>=2.2.0
-  - pytest-qt>=4.2.0
-  - boto3
-
-  # required dependencies
-  - python-dateutil
-  - numpy
-  - pytz
-
-  # optional dependencies
-  - beautifulsoup4>=4.11.2
-  - blosc>=1.21.3
-  - bottleneck>=1.3.6
-  - fastparquet>=2023.10.0
-  - fsspec>=2022.11.0
-  - html5lib>=1.1
-  - hypothesis>=6.46.1
-  - gcsfs>=2022.11.0
-  - jinja2>=3.1.2
-  - lxml>=4.9.2
-  - matplotlib>=3.6.3
-  - numba>=0.56.4
-  - numexpr>=2.8.4
-  - odfpy>=1.4.1
-  - qtpy>=2.3.0
-  - openpyxl>=3.1.0
-  - psycopg2>=2.9.6
-  - pyarrow>=10.0.1
-  - pymysql>=1.0.2
-  - pyqt>=5.15.9
-  - pyreadstat>=1.2.0
-  - pytables>=3.8.0
-  - python-calamine>=0.1.7
-  - pyxlsb>=1.0.10
-  - s3fs>=2022.11.0
-  - scipy>=1.10.0
-  - sqlalchemy>=2.0.0
-  - tabulate>=0.9.0
-  - xarray>=2022.12.0
-  - xlrd>=2.0.1
-  - xlsxwriter>=3.0.5
-  - zstandard>=0.19.0
-
-  - pip:
-    - adbc-driver-postgresql>=0.10.0
-    - adbc-driver-sqlite>=0.8.0
-    - tzdata>=2022.7
-    - pytest-localserver>=0.7.1
diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv
index 5cc466e97cf12..0f7d184728a17 100644
--- a/doc/data/titanic.csv
+++ b/doc/data/titanic.csv
@@ -1,93 +1,93 @@
 PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
 2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
-3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
+3,1,3,"Heikkinen, Miss Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
 5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
 6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
 7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
-8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
+8,0,3,"Palsson, Master Gosta Leonard",male,2,3,1,349909,21.075,,S
 9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
 10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C
-11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
-12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S
+11,1,3,"Sandstrom, Miss Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
+12,1,1,"Bonnell, Miss Elizabeth",female,58,0,0,113783,26.55,C103,S
 13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
 14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S
-15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S
+15,0,3,"Vestrom, Miss Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S
 16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S
-17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q
+17,0,3,"Rice, Master Eugene",male,2,4,1,382652,29.125,,Q
 18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S
 19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S
 20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
 21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S
 22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S
-23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q
+23,1,3,"McGowan, Miss Anna ""Annie""",female,15,0,0,330923,8.0292,,Q
 24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S
-25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S
+25,0,3,"Palsson, Miss Torborg Danira",female,8,3,1,349909,21.075,,S
 26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S
 27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
 28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S
-29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
+29,1,3,"O'Dwyer, Miss Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
 30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S
 31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C
 32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
-33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q
+33,1,3,"Glynn, Miss Mary Agatha",female,,0,0,335677,7.75,,Q
 34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S
 35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C
 36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S
 37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C
 38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S
-39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S
-40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C
+39,0,3,"Vander Planke, Miss Augusta Maria",female,18,2,0,345764,18,,S
+40,1,3,"Nicola-Yarred, Miss Jamila",female,14,1,0,2651,11.2417,,C
 41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S
 42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S
 43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C
-44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C
-45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q
+44,1,2,"Laroche, Miss Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C
+45,1,3,"Devaney, Miss Margaret Delia",female,19,0,0,330958,7.8792,,Q
 46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S
 47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q
-48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q
+48,1,3,"O'Driscoll, Miss Bridget",female,,0,0,14311,7.75,,Q
 49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
 50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S
-51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S
+51,0,3,"Panula, Master Juha Niilo",male,7,4,1,3101295,39.6875,,S
 52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S
 53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C
 54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S
 55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C
 56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S
-57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S
+57,1,2,"Rugg, Miss Emily",female,21,0,0,C.A. 31026,10.5,,S
 58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C
-59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S
-60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S
+59,1,2,"West, Miss Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S
+60,0,3,"Goodwin, Master William Frederick",male,11,5,2,CA 2144,46.9,,S
 61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C
-62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28,
+62,1,1,"Icard, Miss Amelie",female,38,0,0,113572,80,B28,
 63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S
-64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S
+64,0,3,"Skoog, Master Harald",male,4,3,2,347088,27.9,,S
 65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C
-66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
+66,1,3,"Moubarek, Master Gerios",male,,1,1,2661,15.2458,,C
 67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S
 68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S
-69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S
+69,1,3,"Andersson, Miss Erna Alexandra",female,17,4,2,3101281,7.925,,S
 70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S
 71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S
-72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S
+72,0,3,"Goodwin, Miss Lillian Amy",female,16,5,2,CA 2144,46.9,,S
 73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S
 74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C
 75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S
 76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S
 77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S
 78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S
-79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S
-80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S
+79,1,2,"Caldwell, Master Alden Gates",male,0.83,0,2,248738,29,,S
+80,1,3,"Dowdell, Miss Elizabeth",female,30,0,0,364516,12.475,,S
 81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S
 82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S
-83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q
+83,1,3,"McDermott, Miss Brigdet Delia",female,,0,0,330932,7.7875,,Q
 84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S
-85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S
+85,1,2,"Ilett, Miss Bertha",female,17,0,0,SO/C 14885,10.5,,S
 86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S
 87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S
 88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
-89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S
+89,1,1,"Fortune, Miss Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S
 90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S
 91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S
 92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S
@@ -99,35 +99,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C
 99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S
 100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S
-101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S
+101,0,3,"Petranec, Miss Matilda",female,28,0,0,349245,7.8958,,S
 102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
 103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S
 104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S
 105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S
 106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S
-107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S
+107,1,3,"Salkjelsvik, Miss Anna Kristine",female,21,0,0,343120,7.65,,S
 108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S
 109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S
-110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q
+110,1,3,"Moran, Miss Bertha",female,,1,0,371110,24.15,,Q
 111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S
-112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C
+112,0,3,"Zabour, Miss Hileni",female,14.5,1,0,2665,14.4542,,C
 113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S
-114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S
-115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C
+114,0,3,"Jussila, Miss Katriina",female,20,1,0,4136,9.825,,S
+115,0,3,"Attalah, Miss Malake",female,17,0,0,2627,14.4583,,C
 116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S
 117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
 118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S
 119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C
-120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S
+120,0,3,"Andersson, Miss Ellis Anna Maria",female,2,4,2,347082,31.275,,S
 121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S
 122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S
 123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C
-124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S
+124,1,2,"Webber, Miss Susan",female,32.5,0,0,27267,13,E101,S
 125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S
-126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C
+126,1,3,"Nicola-Yarred, Master Elias",male,12,1,0,2651,11.2417,,C
 127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q
 128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S
-129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C
+129,1,3,"Peter, Miss Anna",female,,1,1,2668,22.3583,F E69,C
 130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S
 131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C
 132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S
@@ -135,18 +135,18 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S
 135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S
 136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C
-137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S
+137,1,1,"Newsom, Miss Helen Monypeny",female,19,0,2,11752,26.2833,D47,S
 138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S
 139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S
 140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C
 141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C
-142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S
+142,1,3,"Nysten, Miss Anna Sofia",female,22,0,0,347081,7.75,,S
 143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S
 144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q
 145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S
 146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S
 147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S
-148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S
+148,0,3,"Ford, Miss Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S
 149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S
 150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S
 151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S
@@ -155,35 +155,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
 155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S
 156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C
-157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q
+157,1,3,"Gilnagh, Miss Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q
 158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S
 159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S
-160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
+160,0,3,"Sage, Master Thomas Henry",male,,8,2,CA. 2343,69.55,,S
 161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S
 162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S
 163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S
 164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S
-165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S
-166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S
+165,0,3,"Panula, Master Eino Viljami",male,1,4,1,3101295,39.6875,,S
+166,1,3,"Goldsmith, Master Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S
 167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S
 168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S
 169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S
 170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S
 171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S
-172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q
-173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S
+172,0,3,"Rice, Master Arthur",male,4,4,1,382652,29.125,,Q
+173,1,3,"Johnson, Miss Eleanor Ileen",female,1,1,1,347742,11.1333,,S
 174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S
 175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C
 176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S
-177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S
-178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C
+177,0,3,"Lefebre, Master Henry Forbes",male,,3,1,4133,25.4667,,S
+178,0,1,"Isham, Miss Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C
 179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S
 180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S
-181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
+181,0,3,"Sage, Miss Constance Gladys",female,,8,2,CA. 2343,69.55,,S
 182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C
-183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S
-184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S
-185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S
+183,0,3,"Asplund, Master Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S
+184,1,2,"Becker, Master Richard F",male,1,2,1,230136,39,F4,S
+185,1,3,"Kink-Heilmann, Miss Luise Gretchen",female,4,0,2,315153,22.025,,S
 186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S
 187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q
 188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S
@@ -191,33 +191,33 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S
 191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S
 192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S
-193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S
-194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S
+193,1,3,"Andersen-Jensen, Miss Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S
+194,1,2,"Navratil, Master Michel M",male,3,1,1,230080,26,F2,S
 195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C
-196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C
+196,1,1,"Lurette, Miss Elise",female,58,0,0,PC 17569,146.5208,B80,C
 197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q
 198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S
-199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q
-200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S
+199,1,3,"Madigan, Miss Margaret ""Maggie""",female,,0,0,370370,7.75,,Q
+200,0,2,"Yrois, Miss Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S
 201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S
 202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
 203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S
 204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
 205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S
-206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S
+206,0,3,"Strom, Miss Telma Matilda",female,2,0,1,347054,10.4625,G6,S
 207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S
 208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C
-209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q
+209,1,3,"Carr, Miss Helen ""Ellen""",female,16,0,0,367231,7.75,,Q
 210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C
 211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S
-212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S
+212,1,2,"Cameron, Miss Clear Annie",female,35,0,0,F.C.C. 13528,21,,S
 213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S
 214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S
 215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
-216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C
-217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S
+216,1,1,"Newell, Miss Madeleine",female,31,1,0,35273,113.275,D36,C
+217,1,3,"Honkanen, Miss Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S
 218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S
-219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C
+219,1,1,"Bazzani, Miss Albina",female,32,0,0,11813,76.2917,D15,C
 220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S
 221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S
 222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S
@@ -228,24 +228,24 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S
 228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S
 229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S
-230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S
+230,0,3,"Lefebre, Miss Mathilde",female,,3,1,4133,25.4667,,S
 231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S
 232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S
 233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S
-234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S
+234,1,3,"Asplund, Miss Lillian Gertrud",female,5,4,2,347077,31.3875,,S
 235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S
-236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S
+236,0,3,"Harknett, Miss Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S
 237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S
-238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S
+238,1,2,"Collyer, Miss Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S
 239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S
 240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S
-241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C
-242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q
+241,0,3,"Zabour, Miss Thamine",female,,1,0,2665,14.4542,,C
+242,1,3,"Murphy, Miss Katherine ""Kate""",female,,1,0,367230,15.5,,Q
 243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S
 244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S
 245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C
 246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q
-247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S
+247,0,3,"Lindahl, Miss Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S
 248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S
 249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S
 250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S
@@ -256,28 +256,28 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S
 256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C
 257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
-258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S
-259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C
+258,1,1,"Cherry, Miss Gladys",female,30,0,0,110152,86.5,B77,S
+259,1,1,"Ward, Miss Anna",female,35,0,0,PC 17755,512.3292,,C
 260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S
 261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q
-262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S
+262,1,3,"Asplund, Master Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S
 263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S
 264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S
-265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
+265,0,3,"Henry, Miss Delia",female,,0,0,382649,7.75,,Q
 266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S
 267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S
 268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S
 269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S
-270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S
+270,1,1,"Bissette, Miss Amelia",female,35,0,0,PC 17760,135.6333,C99,S
 271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S
 272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S
 273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S
 274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C
-275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q
-276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S
-277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S
+275,1,3,"Healy, Miss Hanora ""Nora""",female,,0,0,370375,7.75,,Q
+276,1,1,"Andrews, Miss Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S
+277,0,3,"Lindblom, Miss Augusta Charlotta",female,45,0,0,347073,7.75,,S
 278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S
-279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q
+279,0,3,"Rice, Master Eric",male,7,4,1,382652,29.125,,Q
 280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S
 281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q
 282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S
@@ -288,66 +288,66 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S
 288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S
 289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S
-290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q
-291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S
+290,1,3,"Connolly, Miss Kate",female,22,0,0,370373,7.75,,Q
+291,1,1,"Barber, Miss Ellen ""Nellie""",female,26,0,0,19877,78.85,,S
 292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C
 293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C
-294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S
+294,0,3,"Haas, Miss Aloisia",female,24,0,0,349236,8.85,,S
 295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S
 296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C
 297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C
-298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S
+298,0,1,"Allison, Miss Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S
 299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
 300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C
-301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
+301,1,3,"Kelly, Miss Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
 302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q
 303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S
-304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q
+304,1,2,"Keane, Miss Nora A",female,,0,0,226593,12.35,E101,Q
 305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S
-306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
-307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C
+306,1,1,"Allison, Master Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
+307,1,1,"Fleming, Miss Margaret",female,,0,0,17421,110.8833,,C
 308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C
 309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C
-310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C
-311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C
-312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C
+310,1,1,"Francatelli, Miss Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C
+311,1,1,"Hays, Miss Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C
+312,1,1,"Ryerson, Miss Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C
 313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S
 314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S
 315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S
-316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S
+316,1,3,"Nilsson, Miss Helmina Josefina",female,26,0,0,347470,7.8542,,S
 317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S
 318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S
-319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S
+319,1,1,"Wick, Miss Mary Natalie",female,31,0,2,36928,164.8667,C7,S
 320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C
 321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S
 322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S
-323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q
+323,1,2,"Slayter, Miss Hilda Mary",female,30,0,0,234818,12.35,,Q
 324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S
 325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
-326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C
+326,1,1,"Young, Miss Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C
 327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S
 328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S
 329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S
-330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C
-331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
+330,1,1,"Hippach, Miss Jean Gertrude",female,16,0,1,111361,57.9792,B18,C
+331,1,3,"McCoy, Miss Agnes",female,,2,0,367226,23.25,,Q
 332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
 333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S
 334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S
 335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S
 336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S
 337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S
-338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C
+338,1,1,"Burns, Miss Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C
 339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S
 340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S
-341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S
-342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S
+341,1,2,"Navratil, Master Edmond Roger",male,2,1,1,230080,26,F2,S
+342,1,1,"Fortune, Miss Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S
 343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S
 344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S
 345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S
-346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S
-347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S
+346,1,2,"Brown, Miss Amelia ""Mildred""",female,24,0,0,248733,13,F33,S
+347,1,2,"Smith, Miss Marion Elsie",female,40,0,0,31418,13,,S
 348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S
-349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S
+349,1,3,"Coutts, Master William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S
 350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S
 351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S
 352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S
@@ -355,10 +355,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S
 355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C
 356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S
-357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S
-358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S
-359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q
-360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q
+357,1,1,"Bowerman, Miss Elsie Edith",female,22,0,1,113505,55,E33,S
+358,0,2,"Funk, Miss Annie Clemmer",female,38,0,0,237671,13,,S
+359,1,3,"McGovern, Miss Mary",female,,0,0,330931,7.8792,,Q
+360,1,3,"Mockler, Miss Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q
 361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S
 362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C
 363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C
@@ -367,58 +367,58 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S
 367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C
 368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C
-369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q
+369,1,3,"Jermyn, Miss Annie",female,,0,0,14313,7.75,,Q
 370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C
 371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C
 372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S
 373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S
 374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C
-375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S
+375,0,3,"Palsson, Miss Stina Viola",female,3,3,1,349909,21.075,,S
 376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C
-377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S
+377,1,3,"Landergren, Miss Aurora Adelia",female,22,0,0,C 7077,7.25,,S
 378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C
 379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C
 380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S
-381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C
-382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C
+381,1,1,"Bidois, Miss Rosalie",female,42,0,0,PC 17757,227.525,,C
+382,1,3,"Nakid, Miss Maria (""Mary"")",female,1,0,2,2653,15.7417,,C
 383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S
 384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S
 385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S
 386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S
-387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S
-388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S
+387,0,3,"Goodwin, Master Sidney Leonard",male,1,5,2,CA 2144,46.9,,S
+388,1,2,"Buss, Miss Kate",female,36,0,0,27849,13,,S
 389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q
-390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C
+390,1,2,"Lehmann, Miss Bertha",female,17,0,0,SC 1748,12,,C
 391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S
 392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S
 393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S
-394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C
+394,1,1,"Newell, Miss Marjorie",female,23,1,0,35273,113.275,D36,C
 395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S
 396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S
-397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S
+397,0,3,"Olsson, Miss Elina",female,31,0,0,350407,7.8542,,S
 398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S
 399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S
 400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S
 401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S
 402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S
-403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S
+403,0,3,"Jussila, Miss Mari Aina",female,21,1,0,4137,9.825,,S
 404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S
-405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S
+405,0,3,"Oreskovic, Miss Marija",female,20,0,0,315096,8.6625,,S
 406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S
 407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S
-408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S
+408,1,2,"Richards, Master William Rowe",male,3,1,1,29106,18.75,,S
 409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S
-410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S
+410,0,3,"Lefebre, Miss Ida",female,,3,1,4133,25.4667,,S
 411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S
 412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q
-413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q
+413,1,1,"Minahan, Miss Daisy E",female,33,1,0,19928,90,C78,Q
 414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S
 415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S
 416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S
 417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S
-418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S
+418,1,2,"Silven, Miss Lyyli Karoliina",female,18,0,2,250652,13,,S
 419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S
-420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S
+420,0,3,"Van Impe, Miss Catharina",female,10,0,2,345773,24.15,,S
 421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C
 422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q
 423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S
@@ -426,7 +426,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S
 426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S
 427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S
-428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S
+428,1,2,"Phillips, Miss Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S
 429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q
 430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S
 431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S
@@ -434,8 +434,8 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S
 434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S
 435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S
-436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S
-437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S
+436,1,1,"Carter, Miss Lucile Polk",female,14,1,2,113760,120,B96 B98,S
+437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S
 438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S
 439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S
 440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S
@@ -444,10 +444,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S
 444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S
 445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S
-446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S
-447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S
+446,1,1,"Dodge, Master Washington",male,4,0,2,33638,81.8583,A34,S
+447,1,2,"Mellinger, Miss Madeleine Violet",female,13,0,1,250644,19.5,,S
 448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S
-449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C
+449,1,3,"Baclini, Miss Marie Catherine",female,5,2,1,2666,19.2583,,C
 450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S
 451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S
 452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S
@@ -457,7 +457,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C
 457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S
 458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S
-459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S
+459,1,2,"Toomey, Miss Ellen",female,50,0,0,F.C.C. 13531,10.5,,S
 460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q
 461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S
 462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S
@@ -468,42 +468,42 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S
 468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S
 469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q
-470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
+470,1,3,"Baclini, Miss Helene Barbara",female,0.75,2,1,2666,19.2583,,C
 471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S
 472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S
 473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S
 474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C
-475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S
+475,0,3,"Strandberg, Miss Ida Sofia",female,22,0,0,7553,9.8375,,S
 476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S
 477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S
 478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S
 479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S
-480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S
-481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S
+480,1,3,"Hirvonen, Miss Hildur E",female,2,0,1,3101298,12.2875,,S
+481,0,3,"Goodwin, Master Harold Victor",male,9,5,2,CA 2144,46.9,,S
 482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S
 483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S
 484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S
 485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C
-486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S
+486,0,3,"Lefebre, Miss Jeannie",female,,3,1,4133,25.4667,,S
 487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S
 488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C
 489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S
-490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S
+490,1,3,"Coutts, Master Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S
 491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S
 492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S
 493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S
 494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C
 495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S
 496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C
-497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C
+497,1,1,"Eustis, Miss Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C
 498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S
 499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S
 500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S
 501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S
-502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q
-503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q
-504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S
-505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S
+502,0,3,"Canavan, Miss Mary",female,21,0,0,364846,7.75,,Q
+503,0,3,"O'Sullivan, Miss Bridget Mary",female,,0,0,330909,7.6292,,Q
+504,0,3,"Laitinen, Miss Kristina Sofia",female,37,0,0,4135,9.5875,,S
+505,1,1,"Maioni, Miss Roberta",female,16,0,0,110152,86.5,B79,S
 506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C
 507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S
 508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S
@@ -519,41 +519,41 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q
 519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S
 520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S
-521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S
+521,1,1,"Perreault, Miss Anne",female,30,0,0,12749,93.5,B73,S
 522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S
 523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C
 524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C
 525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C
 526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q
-527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S
+527,1,2,"Ridsdale, Miss Lucy",female,50,0,0,W./C. 14258,10.5,,S
 528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
 529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S
 530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S
-531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S
+531,1,2,"Quick, Miss Phyllis May",female,2,1,1,26360,26,,S
 532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C
 533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C
 534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C
-535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S
-536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S
+535,0,3,"Cacic, Miss Marija",female,30,0,0,315084,8.6625,,S
+536,1,2,"Hart, Miss Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S
 537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S
-538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C
+538,1,1,"LeRoy, Miss Bertha",female,30,0,0,PC 17761,106.425,,C
 539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S
-540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C
-541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S
-542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S
-543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S
+540,1,1,"Frolicher, Miss Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C
+541,1,1,"Crosby, Miss Harriet R",female,36,0,2,WE/P 5735,71,B22,S
+542,0,3,"Andersson, Miss Ingeborg Constanzia",female,9,4,2,347082,31.275,,S
+543,0,3,"Andersson, Miss Sigrid Elisabeth",female,11,4,2,347082,31.275,,S
 544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S
 545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C
 546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S
 547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S
 548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
 549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S
-550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S
+550,1,2,"Davies, Master John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S
 551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C
 552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S
 553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q
 554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C
-555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S
+555,1,3,"Ohman, Miss Velin",female,22,0,0,347085,7.775,,S
 556,0,1,"Wright, Mr. 
George",male,62,0,0,113807,26.55,,S 557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C 558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C @@ -563,7 +563,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S 563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S 564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +565,0,3,"Meanwell, Miss (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S 566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S 567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S 568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S @@ -572,19 +572,19 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S 572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S 573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +574,1,3,"Kelly, Miss Mary",female,,0,0,14312,7.75,,Q 575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S 576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +577,1,2,"Garside, Miss Ethel",female,34,0,0,243880,13,,S 578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S 579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C 580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +581,1,2,"Christy, Miss Julie Rachel",female,25,1,1,237789,30,,S 582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C 583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S 584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C 585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +586,1,1,"Taussig, Miss Ruth",female,18,0,2,110413,79.65,E68,S 587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S 588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C 589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S @@ -592,10 +592,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S 592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C 593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +594,0,3,"Bourke, Miss Mary",female,,0,2,364848,7.75,,Q 595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S 596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +597,1,2,"Leitch, Miss Jessie Wills",female,,0,0,248727,33,,S 598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S 599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C 600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C @@ -608,16 +608,16 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 607,0,3,"Karaic, Mr. 
Milan",male,30,0,0,349246,7.8958,,S 608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S 609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +610,1,1,"Shutes, Miss Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S 611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S 612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +613,1,3,"Murphy, Miss Margaret Jane",female,,1,0,367230,15.5,,Q 614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q 615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +616,1,2,"Herman, Miss Alice",female,24,1,2,220845,65,,S 617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S 618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +619,1,2,"Becker, Miss Marion Louise",female,4,2,1,230136,39,F4,S 620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S 621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C 622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S @@ -626,34 +626,34 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S 626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S 627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +628,1,1,"Longley, Miss Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S 629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S 630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q 631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S 632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S 633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C 634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +635,0,3,"Skoog, Miss Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss Mary",female,28,0,0,237668,13,,S 637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S 638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S 639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S 640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S 641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S 642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +643,0,3,"Skoog, Miss Margit Elizabeth",female,2,3,2,347088,27.9,,S 644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +645,1,3,"Baclini, Miss Eugenie",female,0.75,2,1,2666,19.2583,,C 646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C 647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S 648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C 649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 
2314,7.55,,S +650,1,3,"Stanley, Miss Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S 651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +652,1,2,"Doling, Miss Elsie",female,18,0,1,231919,23,,S 653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +654,1,3,"O'Leary, Miss Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss Hanora ""Nora""",female,18,0,0,365226,6.75,,Q 656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S 657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S 658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q @@ -676,10 +676,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S 676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S 677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +678,1,3,"Turja, Miss Anna Sofia",female,18,0,0,4138,9.8417,,S 679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S 680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +681,0,3,"Peters, Miss Katie",female,,0,0,330935,8.1375,,Q 682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C 683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S 684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S @@ -688,48 +688,48 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S 688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S 689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +690,1,1,"Madill, Miss Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S 691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +692,1,3,"Karun, Miss Manca",female,4,0,1,349256,13.4167,,C 693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S 694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C 695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S 696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S 697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +698,1,3,"Mullens, Miss Katherine ""Katie""",female,,0,0,35852,7.7333,,Q 699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C 700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S 701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C 702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +703,0,3,"Barbara, Miss Saiide",female,18,0,1,2691,14.4542,,C 704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q 705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S 706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S 707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S 708,1,1,"Calderhead, Mr. 
Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +709,1,1,"Cleaver, Miss Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C 711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C 712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S 713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S 714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S 715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S 716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +717,1,1,"Endres, Miss Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S 719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q 720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +721,1,2,"Harper, Miss Annie Jessie ""Nina""",female,6,0,1,248727,33,,S 722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S 723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S 724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S 725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S 726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S 727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +728,1,3,"Mannion, Miss Margareth",female,,0,0,36866,7.7375,,Q 729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +730,0,3,"Ilmakangas, Miss Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S 732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C 733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S 734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S @@ -741,20 +741,20 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S 741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S 742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +743,1,1,"Ryerson, Miss Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C 744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S 745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S 746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S 747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +748,1,2,"Sinkkonen, Miss Anna",female,30,0,0,250648,13,,S 749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S 750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. 
Meier",male,6,0,1,392096,12.475,E121,S +751,1,2,"Wells, Miss Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master Meier",male,6,0,1,392096,12.475,E121,S 753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S 754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S 755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +756,1,2,"Hamalainen, Master Viljo",male,0.67,1,1,250649,14.5,,S 757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S 758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S 759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S @@ -766,7 +766,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S 766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S 767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +768,0,3,"Mangan, Miss Mary",female,30.5,0,0,364850,7.75,,Q 769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q 770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S 771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S @@ -776,22 +776,22 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S 776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S 777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +778,1,3,"Emanuel, Miss Virginia Ethel",female,5,0,0,364516,12.475,,S 779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q 780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +781,1,3,"Ayoub, Miss Banoura",female,13,0,0,2687,7.2292,,C 782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S 783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S 784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S 785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S 786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +787,1,3,"Sjoblom, Miss Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S 790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C 791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q 792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +793,0,3,"Sage, Miss Stella Anna",female,,8,2,CA. 2343,69.55,,S 794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C 795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S 796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S @@ -801,47 +801,47 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S 801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S 802,1,2,"Collyer, Mrs. 
Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +803,1,1,"Carter, Master William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master Assad Alexander",male,0.42,0,1,2625,8.5167,,C 805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S 806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S 807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +808,0,3,"Pettersson, Miss Ellen Natalia",female,18,0,0,347087,7.775,,S 809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S 810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S 811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S 812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S 813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +814,0,3,"Andersson, Miss Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S 815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S 816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +817,0,3,"Heininen, Miss Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S 818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C 819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +820,0,3,"Skoog, Master Karl Thorsten",male,10,3,2,347088,27.9,,S 821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S 822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S 823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S 824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +825,0,3,"Panula, Master Urho Abraham",male,2,4,1,3101295,39.6875,,S 826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q 827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +828,1,2,"Mallet, Master Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C 829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q 830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, 831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +832,1,2,"Richards, Master George Sibley",male,0.83,1,1,29106,18.75,,S 833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C 834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S 835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +836,1,1,"Compton, Miss Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C 837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S 838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S 839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S 840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C 841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S 842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. 
Augusta",female,30,0,0,113798,31,,C +843,1,1,"Serepeca, Miss Augusta",female,30,0,0,113798,31,,C 844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C 845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S 846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S @@ -849,10 +849,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C 849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S 850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +851,0,3,"Andersson, Master Sigvard Harald Elias",male,4,4,2,347082,31.275,,S 852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +853,0,3,"Boulos, Miss Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss Mary Conover",female,16,0,1,PC 17592,39.4,D28,S 855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S 856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S 857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S @@ -862,31 +862,31 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S 862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S 863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +864,0,3,"Sage, Miss Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S 865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S 866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +867,1,2,"Duran y More, Miss Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C 868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S 869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +870,1,3,"Johnson, Master Harold Theodor",male,4,1,1,347742,11.1333,,S 871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S 872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S 873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S 874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S 875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +876,1,3,"Najib, Miss Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C 877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S 878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S 879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S 880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C 881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S 882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +883,0,3,"Dahlberg, Miss Gerda Ulrika",female,22,0,0,7552,10.5167,,S 884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S 885,0,3,"Sutehall, Mr. 
Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S 886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +888,1,1,"Graham, Miss Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/doc/source/_static/schemas/01_table_spreadsheet.png b/doc/source/_static/schemas/01_table_spreadsheet.png index b3cf5a0245b9c..4e3497879de31 100644 Binary files a/doc/source/_static/schemas/01_table_spreadsheet.png and b/doc/source/_static/schemas/01_table_spreadsheet.png differ diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 325c902dd4f9e..0691414f53306 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -130,7 +130,7 @@ Consult the docs for setting up pyenv `here `__. pyenv virtualenv # For instance: - pyenv virtualenv 3.9.10 pandas-dev + pyenv virtualenv 3.10 pandas-dev # Activate the virtualenv pyenv activate pandas-dev diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index f958e4c4ad1fc..a3665c5bb4d1f 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -51,7 +51,7 @@ pandas may change the behavior of experimental features at any time. Python support ~~~~~~~~~~~~~~ -pandas mirrors the `NumPy guidelines for Python support `__. +pandas mirrors the `SPEC 0 guideline for Python support `__. Security policy ~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 01a79fc8e36fd..86ce05fde547b 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -161,7 +161,7 @@ Python terminal. >>> import pandas as pd >>> pd.test() - running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.9/site-packages/pandas + running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas ============================= test session starts ============================== platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index ff89589baefb1..efcdb22778ef4 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno "Name": [ "Braund, Mr. Owen Harris", "Allen, Mr. William Henry", - "Bonnell, Miss. 
Elizabeth", + "Bonnell, Miss Elizabeth", ], "Age": [22, 35, 58], "Sex": ["male", "male", "female"], diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 05a7d63b7ff47..a8b7a387d80ec 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -6,11 +6,11 @@ Package overview **************** -pandas is a `Python `__ package providing fast, +pandas is a `Python `__ package that provides fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the -fundamental high-level building block for doing practical, **real-world** data -analysis in Python. Additionally, it has the broader goal of becoming **the +fundamental high-level building block for Python's practical, **real-world** data +analysis. Additionally, it seeks to become **the most powerful and flexible open source data analysis/manipulation tool available in any language**. It is already well on its way toward this goal. diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 0ff40dcdcd150..5cdc9779ef4e1 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1606,7 +1606,7 @@ For instance: This method does not convert the row to a Series object; it merely returns the values inside a namedtuple. Therefore, :meth:`~DataFrame.itertuples` preserves the data type of the values -and is generally faster as :meth:`~DataFrame.iterrows`. +and is generally faster than :meth:`~DataFrame.iterrows`. .. note:: diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 7b2fd32303845..1e7d66dfeb142 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -793,7 +793,7 @@ Assigning a ``Categorical`` to parts of a column of other types will use the val :okwarning: df = pd.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) - df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"]) + df.loc[1:2, "a"] = pd.Categorical([2, 2], categories=[2, 3]) df.loc[2:3, "b"] = pd.Categorical(["b", "b"], categories=["a", "b"]) df df.dtypes diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index dc06dd9620c24..be40710a9e307 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2161,7 +2161,7 @@ a JSON string with two fields, ``schema`` and ``data``. { "A": [1, 2, 3], "B": ["a", "b", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=3), + "C": pd.date_range("2016-01-01", freq="D", periods=3), }, index=pd.Index(range(3), name="idx"), ) @@ -2270,7 +2270,7 @@ round-trippable manner. { "foo": [1, 2, 3, 4], "bar": ["a", "b", "c", "d"], - "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "baz": pd.date_range("2018-01-01", freq="D", periods=4), "qux": pd.Categorical(["a", "b", "c", "c"]), }, index=pd.Index(range(4), name="idx"), @@ -3003,7 +3003,7 @@ However, if XPath does not reference node names such as default, ``/*``, then .. note:: Since ``xpath`` identifies the parent of content to be parsed, only immediate - desendants which include child nodes or current attributes are parsed. + descendants which include child nodes or current attributes are parsed. Therefore, ``read_xml`` will not parse the text of grandchildren or other descendants and will not parse attributes of any descendant. To retrieve lower level content, adjust xpath to lower level. 
For example, @@ -3535,7 +3535,7 @@ For example, to read in a ``MultiIndex`` index without names: df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df -If the index has level names, they will parsed as well, using the same +If the index has level names, they will be parsed as well, using the same parameters. .. ipython:: python @@ -4990,7 +4990,7 @@ Caveats convenience you can use ``store.flush(fsync=True)`` to do this for you. * Once a ``table`` is created columns (DataFrame) are fixed; only exactly the same columns can be appended -* Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) +* Be aware that timezones (e.g., ``zoneinfo.ZoneInfo('US/Eastern')``) are not necessarily equal across timezone versions. So if data is localized to a specific timezone in the HDFStore using one version of a timezone library and that data is updated with another version, the data @@ -5169,6 +5169,8 @@ See the `Full Documentation `__. .. ipython:: python + import pytz + df = pd.DataFrame( { "a": list("abc"), @@ -5178,7 +5180,7 @@ See the `Full Documentation `__. "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.date_range("20130101", periods=3, tz=pytz.timezone("US/Eastern")), "i": pd.date_range("20130101", periods=3, freq="ns"), } ) @@ -5847,10 +5849,10 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table` Schema support '''''''''''''' -Reading from and writing to different schema's is supported through the ``schema`` +Reading from and writing to different schemas is supported through the ``schema`` keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql` functions. Note however that this depends on the database flavor (sqlite does not -have schema's). For example: +have schemas). For example: .. code-block:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 66e42352754ae..e15939eb49239 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -60,7 +60,7 @@ To detect these missing value, use the :func:`isna` or :func:`notna` methods. .. warning:: - Equality compaisons between ``np.nan``, :class:`NaT`, and :class:`NA` + Equality comparisons between ``np.nan``, :class:`NaT`, and :class:`NA` do not act like ``None`` .. ipython:: python @@ -319,7 +319,7 @@ Missing values propagate through arithmetic operations between pandas objects. The descriptive statistics and computational methods discussed in the :ref:`data structure overview ` (and listed :ref:`here -` and :ref:`here `) are all +` and :ref:`here `) all account for missing data. When summing data, NA values or empty data will be treated as zero. diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index ce805f98ca528..7757d95c2bccd 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -8,7 +8,7 @@ Options and settings Overview -------- -pandas has an options API configure and customize global behavior related to +pandas has an options API to configure and customize global behavior related to :class:`DataFrame` display, data behavior and more. Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). 
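For orientation alongside the options hunk above, a minimal sketch of the dotted-name options API it describes; ``display.max_rows`` is a standard option name and the accessors shown are the public pandas entry points:

.. code-block:: python

    import pandas as pd

    # Options are addressed by dotted, case-insensitive names.
    pd.get_option("display.max_rows")

    # Set a value globally, then restore the library default.
    pd.set_option("display.max_rows", 999)
    pd.reset_option("display.max_rows")

    # Or scope the change to a block; the prior value is restored on exit.
    with pd.option_context("display.max_rows", 5):
        print(pd.get_option("display.max_rows"))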
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 5daf204f39bcf..01df17bac5fd7 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -35,7 +35,7 @@ You can construct a ``Timedelta`` scalar through various arguments, including `I
    pd.Timedelta(days=1, seconds=1)
    # integers with a unit
-   pd.Timedelta(1, unit="d")
+   pd.Timedelta(1, unit="D")
    # from a datetime.timedelta/np.timedelta64
    pd.Timedelta(datetime.timedelta(days=1, seconds=1))
@@ -94,7 +94,7 @@ is numeric:
 .. ipython:: python
    pd.to_timedelta(np.arange(5), unit="s")
-   pd.to_timedelta(np.arange(5), unit="d")
+   pd.to_timedelta(np.arange(5), unit="D")
 .. warning::
    If a string or array of strings is passed as an input then the ``unit`` keyword
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index d5137baa95ab8..0845417e4910d 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1479,7 +1479,7 @@ or some other non-observed day. Defined observance rules are:
    "after_nearest_workday", "apply ``nearest_workday`` and then move to next workday after that day"
    "sunday_to_monday", "move Sunday to following Monday"
    "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday"
-   "previous_friday", move Saturday and Sunday to previous Friday"
+   "previous_friday", "move Saturday and Sunday to previous Friday"
    "next_monday", "move Saturday and Sunday to following Monday"
    "weekend_to_monday", "same as ``next_monday``"
@@ -2337,7 +2337,7 @@ Time zone handling
------------------
pandas provides rich support for working with timestamps in different time
-zones using the ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone`
+zones using the ``zoneinfo``, ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone`
objects from the standard library.
@@ -2354,14 +2354,14 @@ By default, pandas objects are time zone unaware:
To localize these dates to a time zone (assign a particular time zone to a naive date),
you can use the ``tz_localize`` method or the ``tz`` keyword argument in
:func:`date_range`, :class:`Timestamp`, or :class:`DatetimeIndex`.
-You can either pass ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings.
+You can either pass ``zoneinfo``, ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings.
Olson time zone strings will return ``pytz`` time zone objects by default.
To return ``dateutil`` time zone objects, append ``dateutil/`` before the string.
-* In ``pytz`` you can find a list of common (and less common) time zones using
-  ``from pytz import common_timezones, all_timezones``.
+* For ``zoneinfo``, a list of available time zones is available from :py:func:`zoneinfo.available_timezones`.
+* In ``pytz`` you can find a list of common (and less common) time zones using ``pytz.all_timezones``.
* ``dateutil`` uses the OS time zones so there isn't a fixed list available. For
-  common zones, the names are the same as ``pytz``.
+  common zones, the names are the same as ``pytz`` and ``zoneinfo``.
.. ipython:: python
@@ -2466,7 +2466,7 @@ you can use the ``tz_convert`` method.
.. warning::
-    If you are using dates beyond 2038-01-18, due to current deficiencies
+    If you are using dates beyond 2038-01-18 with ``pytz``, due to current deficiencies
     in the underlying libraries caused by the year 2038 problem, daylight saving time (DST)
     adjustments to timezone aware dates will not be applied.
If and when the underlying libraries are fixed, the DST transitions will be applied. @@ -2475,9 +2475,11 @@ you can use the ``tz_convert`` method. .. ipython:: python + import pytz + d_2037 = "2037-03-31T010101" d_2038 = "2038-03-31T010101" - DST = "Europe/London" + DST = pytz.timezone("Europe/London") assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT") assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT") diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 3c5488a47bdf2..8e323d8aac5e3 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -523,13 +523,25 @@ Enhancements Using the new top-level ``to_timedelta``, you can convert a scalar or array from the standard timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``). - .. ipython:: python + .. code-block:: ipython + + In [53]: pd.to_timedelta('1 days 06:05:01.00003') + Out[53]: Timedelta('1 days 06:05:01.000030') + + In [54]: pd.to_timedelta('15.5us') + Out[54]: Timedelta('0 days 00:00:00.000015500') + + In [55]: pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + Out[55]: TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) + + In [56]: pd.to_timedelta(np.arange(5), unit='s') + Out[56]: + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], + dtype='timedelta64[ns]', freq=None) - pd.to_timedelta('1 days 06:05:01.00003') - pd.to_timedelta('15.5us') - pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) - pd.to_timedelta(np.arange(5), unit='s') - pd.to_timedelta(np.arange(5), unit='d') + In [57]: pd.to_timedelta(np.arange(5), unit='d') + Out[57]: TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object, or astyped to yield a ``float64`` dtyped Series. This diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 569197fe9daf5..563035e0e2940 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -322,15 +322,28 @@ Tz-aware are rounded, floored and ceiled in local times Timedeltas -.. ipython:: python +.. code-block:: ipython + + In [37]: t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t - t.round('10min') + In [38]: t + Out[38]: + TimedeltaIndex(['1 days 02:13:00.000045', '2 days 02:13:00.000045', + '3 days 02:13:00.000045'], + dtype='timedelta64[ns]', freq='D') + + In [39]: t.round('10min') + Out[39]: + TimedeltaIndex(['1 days 02:10:00', '2 days 02:10:00', + '3 days 02:10:00'], + dtype='timedelta64[ns]', freq=None) # Timedelta scalar - t[0] - t[0].round('2h') + In [40]: t[0] + Out[40]: Timedelta('1 days 02:13:00.000045') + + In [41]: t[0].round('2h') + Out[41]: Timedelta('1 days 02:00:00') In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available through the ``.dt`` accessor of ``Series``. diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index f63db945165e7..d6d1d96ccc878 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -308,15 +308,26 @@ The new orient ``'table'`` for :meth:`DataFrame.to_json` will generate a `Table Schema`_ compatible string representation of the data. -.. 
ipython:: python +.. code-block:: ipython - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) - df - df.to_json(orient='table') + In [38]: df = pd.DataFrame( + ....: {'A': [1, 2, 3], + ....: 'B': ['a', 'b', 'c'], + ....: 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + ....: index=pd.Index(range(3), name='idx')) + In [39]: df + Out[39]: + A B C + idx + 0 1 a 2016-01-01 + 1 2 b 2016-01-02 + 2 3 c 2016-01-03 + + [3 rows x 3 columns] + + In [40]: df.to_json(orient='table') + Out[40]: + '{"schema":{"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"1.4.0"},"data":[{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000"}]}' See :ref:`IO: Table Schema for more information `. diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index a33a8f7addeef..8a9227ac37b67 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -157,16 +157,27 @@ sum and ``1`` for product. *pandas 0.22.0* -.. ipython:: python +.. code-block:: ipython + + In [11]: s = pd.Series([1, 1, np.nan, np.nan], + ....: index=pd.date_range("2017", periods=4)) - s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) - s.resample("2d").sum() + In [12]: s.resample("2d").sum() + Out[12]: + 2017-01-01 2.0 + 2017-01-03 0.0 + Freq: 2D, Length: 2, dtype: float64 To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. -.. ipython:: python +.. code-block:: ipython + + In [13]: s.resample("2d").sum(min_count=1) + Out[13]: + 2017-01-01 2.0 + 2017-01-03 NaN + Freq: 2D, Length: 2, dtype: float64 - s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 808741ccf4475..663b47a4d2d55 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -50,19 +50,55 @@ JSON read/write round-trippable with ``orient='table'`` A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata. -.. ipython:: python +.. 
code-block:: ipython

- df = pd.DataFrame({'foo': [1, 2, 3, 4],
- 'bar': ['a', 'b', 'c', 'd'],
- 'baz': pd.date_range('2018-01-01', freq='d', periods=4),
- 'qux': pd.Categorical(['a', 'b', 'c', 'c'])},
- index=pd.Index(range(4), name='idx'))
- df
- df.dtypes
- df.to_json('test.json', orient='table')
- new_df = pd.read_json('test.json', orient='table')
- new_df
- new_df.dtypes
+ In [1]: df = pd.DataFrame({'foo': [1, 2, 3, 4],
+ ...: 'bar': ['a', 'b', 'c', 'd'],
+ ...: 'baz': pd.date_range('2018-01-01', freq='d', periods=4),
+ ...: 'qux': pd.Categorical(['a', 'b', 'c', 'c'])},
+ ...: index=pd.Index(range(4), name='idx'))
+
+ In [2]: df
+ Out[2]:
+ foo bar baz qux
+ idx
+ 0 1 a 2018-01-01 a
+ 1 2 b 2018-01-02 b
+ 2 3 c 2018-01-03 c
+ 3 4 d 2018-01-04 c
+
+ [4 rows x 4 columns]
+
+ In [3]: df.dtypes
+ Out[3]:
+ foo int64
+ bar object
+ baz datetime64[ns]
+ qux category
+ Length: 4, dtype: object
+
+ In [4]: df.to_json('test.json', orient='table')
+
+ In [5]: new_df = pd.read_json('test.json', orient='table')
+
+ In [6]: new_df
+ Out[6]:
+ foo bar baz qux
+ idx
+ 0 1 a 2018-01-01 a
+ 1 2 b 2018-01-02 b
+ 2 3 c 2018-01-03 c
+ 3 4 d 2018-01-04 c
+
+ [4 rows x 4 columns]
+
+ In [7]: new_df.dtypes
+ Out[7]:
+ foo int64
+ bar object
+ baz datetime64[ns]
+ qux category
+ Length: 4, dtype: object
Please note that the string ``index`` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 07f5b01709223..cd917924880f1 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -39,8 +39,10 @@ Other enhancements
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
+- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
@@ -190,6 +192,11 @@ In cases with mixed-resolution inputs, the highest resolution is used:
.. _whatsnew_300.api_breaking.deps:
+Increased minimum version for Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas 3.0.0 supports Python 3.10 and higher.
+
Increased minimum versions for dependencies
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Some minimum supported versions of dependencies were updated.
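A minimal sketch of what the raised floor means for downstream code; the guard and its error message are illustrative, not a pandas API:

.. code-block:: python

    import sys

    # pandas 3.0.0 supports Python 3.10 and higher, so fail fast on an
    # older interpreter before importing it.
    if sys.version_info < (3, 10):
        raise RuntimeError("pandas 3.0 requires Python 3.10 or newer")

    import pandas as pd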
@@ -224,6 +231,7 @@ Other API changes
- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
- :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`)
- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`)
+- Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`; previously the output had a new :class:`RangeIndex` (:issue:`51452`)
- Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`)
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
@@ -272,7 +280,10 @@ Other Deprecations
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)
- Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`)
+- Deprecated lowercase strings ``d``, ``b`` and ``c`` denoting frequencies in :class:`Day`, :class:`BusinessDay` and :class:`CustomBusinessDay` in favour of ``D``, ``B`` and ``C`` (:issue:`58998`)
+- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
+- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
..

 .. ---------------------------------------------------------------------------
@@ -350,6 +361,7 @@ Other Removals
 - Changed the default value of ``na_action`` in :meth:`Categorical.map` to ``None`` (:issue:`51645`)
 - Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
 - Enforce deprecation in :func:`testing.assert_series_equal` and :func:`testing.assert_frame_equal` with object dtype and mismatched null-like values, which are now considered not-equal (:issue:`18463`)
+- Enforce banning of upcasting in in-place setitem-like operations (:issue:`59007`) (see `PDEP6 `_)
 - Enforced deprecation ``all`` and ``any`` reductions with ``datetime64``, :class:`DatetimeTZDtype`, and :class:`PeriodDtype` dtypes (:issue:`58029`)
 - Enforced deprecation disallowing ``float`` "periods" in :func:`date_range`, :func:`period_range`, :func:`timedelta_range`, :func:`interval_range` (:issue:`56036`)
 - Enforced deprecation disallowing parsing datetimes with mixed time zones unless user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`)
@@ -371,9 +383,13 @@ Other Removals
 - Enforced deprecation of string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57699`)
 - Enforced deprecation of string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`57793`)
 - Enforced deprecation of string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57793`)
+- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`59143`) (see the sketch below)
+- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting units in :class:`Timedelta` (:issue:`59143`)
 - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`)
 - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`)
 - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`)
+- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. (:issue:`58270`)
+- Enforced deprecation of the behavior of :meth:`Series.argsort` in the presence of NA values (:issue:`58232`)
 - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`)
 - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
 - Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`)
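Because the ``H``/``BH``/``CBH`` removal above is now enforced, the old spelling raises instead of warning; a sketch (the exact error message may differ):

.. code-block:: python

    import pandas as pd

    pd.date_range("2024-01-01", periods=3, freq="h")  # lowercase is supported

    try:
        pd.date_range("2024-01-01", periods=3, freq="H")  # removed alias
    except ValueError as err:
        print(err)  # e.g. "Invalid frequency: H"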
@@ -489,6 +505,8 @@ Datetimelike
 - Bug in :meth:`DataFrame.agg` with a DataFrame containing missing values resulting in an ``IndexError`` (:issue:`58810`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
+- Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`)
+- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow-backed :class:`Series`. (:issue:`59154`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

 Timedelta
@@ -503,8 +521,8 @@ Timezones
 Numeric
 ^^^^^^^
+- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
 - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
--

 Conversion
 ^^^^^^^^^^
@@ -537,16 +555,23 @@ MultiIndex
 ^^^^^^^^^^
 - :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
 - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
+- :meth:`MultiIndex.insert` would not insert NA values correctly at index position -1 (:issue:`59003`)
-

 I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
+- Bug in :meth:`DataFrame.to_json` when ``"index"`` was a value in :attr:`DataFrame.columns` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`DataFrame.to_dict` raising an unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing an empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing a :class:`DataFrame` with ``byteorder="big"``. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
+- Bug in :meth:`HDFStore.get` failing to save data of dtype ``datetime64[s]`` correctly (:issue:`59004`)
+- Bug in :meth:`read_csv` causing a segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
+- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) (see the sketch below)
+- Bug in :meth:`read_excel` raising a ``ValueError`` when passing an array of boolean values with ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
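The ``nrows``/``iterator`` fix noted above can be checked with a sketch like the following (the CSV content is illustrative):

.. code-block:: python

    import io

    import pandas as pd

    data = io.StringIO("a,b\n1,2\n3,4\n5,6\n")

    # Previously raised TypeError when iterator=True was combined with
    # nrows but no chunksize; a TextFileReader is now returned as expected.
    with pd.read_csv(data, nrows=2, iterator=True) as reader:
        print(reader.get_chunk())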

 Period
@@ -571,6 +596,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
+- Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`)
 - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_periods`` periods if ``method="table"``. (:issue:`58868`)

@@ -578,11 +604,12 @@ Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating an empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)

 Sparse
 ^^^^^^
 - Bug in :class:`SparseDtype` for equal comparison with NA fill value. (:issue:`54770`)
--
+- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard-coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)

 ExtensionArray
 ^^^^^^^^^^^^^^
@@ -596,8 +623,11 @@ Styler
 Other
 ^^^^^
 - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :func:`eval` where division ``/`` involving an :class:`ExtensionArray` failed with a ``TypeError``. (:issue:`58748`)
+- Bug in :func:`eval` where division ``/`` on ``complex`` values discarded the imaginary part. (:issue:`21374`)
 - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
 - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
+- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow use of the ``tan`` function.
(:issue:`55091`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) @@ -605,6 +635,7 @@ Other - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) +- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 95c549a8ff0e8..51794ec04b29e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -55,7 +55,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, NamedTuple, cast, ) @@ -66,6 +65,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Sequence, ) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15f8727c38f8d..d7e485f74e58b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -398,8 +398,14 @@ def group_cumsum( for i in range(N): lab = labels[i] - if lab < 0: + if uses_mask and lab < 0: + # GH#58811 + result_mask[i, :] = True + out[i, :] = 0 + continue + elif lab < 0: continue + for j in range(K): val = values[i, j] diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 4c1969f6d9f57..2c32fb0481486 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -245,7 +245,12 @@ static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_datetime_slots[] = { - {Py_mod_exec, pandas_datetime_exec}, {0, NULL}}; + {Py_mod_exec, pandas_datetime_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_datetimemodule = { PyModuleDef_HEAD_INIT, diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 48f3cd14cbc30..51cdf071a15cf 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -161,7 +161,12 @@ static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_parser_slots[] = { - {Py_mod_exec, pandas_parser_exec}, {0, NULL}}; + {Py_mod_exec, pandas_parser_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_parsermodule = { PyModuleDef_HEAD_INIT, diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index fa91db5fe34e3..5f35860c59cb7 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -410,8 +410,8 @@ 
static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); npyarr->stridedim = npyarr->ndim; npyarr->index[npyarr->ndim] = 0; npyarr->inc = -1; @@ -452,8 +452,8 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { return; } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -524,8 +524,8 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 455bca35d160a..d8c536a34bc04 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -14,11 +14,12 @@ cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cdef dict c_OFFSET_TO_PERIOD_FREQSTR cdef dict c_PERIOD_TO_OFFSET_FREQSTR cdef dict c_OFFSET_RENAMED_FREQSTR -cdef dict c_DEPR_ABBREVS +cdef dict c_DEPR_UNITS cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit +cdef str INVALID_FREQ_ERR_MSG cdef enum c_FreqGroup: # Mirrors FreqGroup in the .pyx file diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 479a5a328b1d8..7e6e382c17cc6 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -1,9 +1,6 @@ # period frequency constants corresponding to scikits timeseries # originals from enum import Enum -import warnings - -from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( @@ -338,19 +335,34 @@ PERIOD_TO_OFFSET_FREQSTR = { cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR cdef dict c_PERIOD_TO_OFFSET_FREQSTR = PERIOD_TO_OFFSET_FREQSTR -# Map deprecated resolution abbreviations to correct resolution abbreviations -cdef dict c_DEPR_ABBREVS = { +cdef dict c_DEPR_UNITS = { + "w": "W", + "d": "D", "H": "h", - "BH": "bh", - "CBH": "cbh", + "MIN": "min", "S": "s", + "MS": "ms", + "US": "us", + "NS": "ns", } cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { "w": "W", + "w-mon": "W-MON", + "w-tue": "W-TUE", + "w-wed": "W-WED", + "w-thu": "W-THU", + "w-fri": "W-FRI", + "w-sat": "W-SAT", + "w-sun": "W-SUN", + "d": "D", + "b": "B", + "c": "C", "MIN": "min", } +cdef str INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file @@ -440,43 +452,18 @@ class Resolution(Enum): >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ - cdef: - str abbrev - if freq in {"T", 
"t", "L", "l", "U", "u", "N", "n"}: - raise ValueError( - f"Frequency \'{freq}\' is no longer supported." - ) try: - if freq in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[freq] - warnings.warn( - f"\'{freq}\' is deprecated and will be removed in a future " - f"version. Please use \'{abbrev}\' " - f"instead of \'{freq}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - freq = abbrev attr_name = _abbrev_to_attrnames[freq] - except KeyError: + except KeyError as exc: + msg = INVALID_FREQ_ERR_MSG.format(freq) # For quarterly and yearly resolutions, we need to chop off # a month string. split_freq = freq.split("-") if len(split_freq) != 2: - raise + raise ValueError(msg) from exc if split_freq[1] not in _month_names: # i.e. we want e.g. "Q-DEC", not "Q-INVALID" - raise - if split_freq[0] in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[split_freq[0]] - warnings.warn( - f"\'{split_freq[0]}\' is deprecated and will be removed in a " - f"future version. Please use \'{abbrev}\' " - f"instead of \'{split_freq[0]}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - split_freq[0] = abbrev + raise ValueError(msg) from exc attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index c483814a3ef74..4544cf56a11ec 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -633,6 +633,16 @@ class NaTType(_NaT): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -841,7 +851,7 @@ class NaTType(_NaT): Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -894,7 +904,7 @@ class NaTType(_NaT): ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. Notes @@ -957,11 +967,21 @@ class NaTType(_NaT): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. + Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP @@ -1139,6 +1159,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. 
+ Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the @@ -1301,7 +1327,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -1355,7 +1381,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -1455,13 +1481,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """, ) @@ -1490,7 +1516,7 @@ default 'raise' See Also -------- - Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.asm8 : Return numpy datetime64 format with same precision. Timestamp.to_pydatetime : Convert Timestamp object to a native Python datetime object. to_timedelta : Convert argument into timedelta object, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a24941e4f0a5a..0afeb002a8151 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -56,7 +56,6 @@ from pandas._libs.tslibs.ccalendar cimport ( ) from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, c_OFFSET_RENAMED_FREQSTR, c_OFFSET_TO_PERIOD_FREQSTR, c_PERIOD_AND_OFFSET_DEPR_FREQSTR, @@ -4890,16 +4889,16 @@ cpdef to_offset(freq, bint is_period=False): ) name = c_PERIOD_TO_OFFSET_FREQSTR.get(name.upper()) - if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " - f" instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name @@ -4908,16 +4907,6 @@ cpdef to_offset(freq, bint is_period=False): if not stride: stride = 1 - if prefix in c_DEPR_ABBREVS: - warnings.warn( - f"\'{prefix}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_DEPR_ABBREVS.get(prefix)}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - prefix = c_DEPR_ABBREVS[prefix] - if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3h" or # "2.5min", so we can construct a Timedelta with the @@ -4948,7 +4937,12 @@ cpdef 
to_offset(freq, bint is_period=False): if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - if is_period and not hasattr(result, "_period_dtype_code"): + try: + has_period_dtype_code = hasattr(result, "_period_dtype_code") + except ValueError: + has_period_dtype_code = False + + if is_period and not has_period_dtype_code: if isinstance(freq, str): raise ValueError(f"{result.name} is not supported as period frequency") else: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 023a0f52e320f..c6ba97fe9f1a2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2472,11 +2472,24 @@ cdef class _Period(PeriodMixin): """ Return the period of now's date. + The `now` method provides a convenient way to generate a period + object for the current date and time. This can be particularly + useful in financial and economic analysis, where data is often + collected and analyzed in regular intervals (e.g., hourly, daily, + monthly). By specifying the frequency, users can create periods + that match the granularity of their data. + Parameters ---------- freq : str, BaseOffset Frequency to use for the returned period. + See Also + -------- + to_datetime : Convert argument to datetime. + Period : Represents a period of time. + Period.to_timestamp : Return the Timestamp representation of the Period. + Examples -------- >>> pd.Period.now('h') # doctest: +SKIP diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 24ec6c8891a89..979a5666661b2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -39,8 +39,6 @@ UnitChoices: TypeAlias = Literal[ "minute", "min", "minutes", - "T", - "t", "s", "seconds", "sec", @@ -50,21 +48,17 @@ UnitChoices: TypeAlias = Literal[ "millisecond", "milli", "millis", - "L", - "l", "us", "microseconds", "microsecond", "µs", "micro", "micros", - "u", "ns", "nanoseconds", "nano", "nanos", "nanosecond", - "n", ] _S = TypeVar("_S", bound=timedelta) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4ff2df34ac717..d5348311f19e2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -43,7 +43,7 @@ from pandas._libs.tslibs.conversion cimport ( precision_from_unit, ) from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, + c_DEPR_UNITS, get_supported_reso, is_supported_unit, npy_unit_to_abbrev, @@ -719,15 +719,15 @@ cpdef inline str parse_timedelta_unit(str unit): return "ns" elif unit == "M": return unit - elif unit in c_DEPR_ABBREVS: + elif unit in c_DEPR_UNITS: warnings.warn( f"\'{unit}\' is deprecated and will be removed in a " - f"future version. Please use \'{c_DEPR_ABBREVS.get(unit)}\' " + f"future version. Please use \'{c_DEPR_UNITS.get(unit)}\' " f"instead of \'{unit}\'.", FutureWarning, stacklevel=find_stack_level(), ) - unit = c_DEPR_ABBREVS[unit] + unit = c_DEPR_UNITS[unit] try: return timedelta_abbrevs[unit.lower()] except KeyError: @@ -1078,10 +1078,22 @@ cdef class _Timedelta(timedelta): """ Returns the days of the timedelta. + The `days` attribute of a `pandas.Timedelta` object provides the number + of days represented by the `Timedelta`. This is useful for extracting + the day component from a `Timedelta` that may also include hours, minutes, + seconds, and smaller time units. This attribute simplifies the process + of working with durations where only the day component is of interest. 
+ Returns ------- int + See Also + -------- + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Timedelta.total_seconds : Returns the total duration in seconds. + Examples -------- >>> td = pd.Timedelta(1, "d") @@ -1731,6 +1743,12 @@ cdef class _Timedelta(timedelta): ------- Timedelta + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + to_timedelta : Convert argument to timedelta. + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -1800,10 +1818,10 @@ class Timedelta(_Timedelta): * 'microseconds', 'microsecond', 'micros', 'micro', or 'us' * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'. - .. deprecated:: 2.2.0 + .. deprecated:: 3.0.0 - Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour - of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. + Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units + are deprecated in favour of the values `W`, `D`, `min`, `ms`, `us` and `ns`. **kwargs Available kwargs: {days, seconds, microseconds, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 04bd439b40b8d..cd749effd1a5f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1139,7 +1139,7 @@ cdef class _Timestamp(ABCTimestamp): See Also -------- - Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.asm8 : Return numpy datetime64 format with same precision. Timestamp.to_pydatetime : Convert Timestamp object to a native Python datetime object. to_timedelta : Convert argument into timedelta object, @@ -1170,7 +1170,7 @@ cdef class _Timestamp(ABCTimestamp): @property def asm8(self) -> np.datetime64: """ - Return numpy datetime64 format in nanoseconds. + Return numpy datetime64 format with same precision. See Also -------- @@ -1374,11 +1374,11 @@ class Timestamp(_Timestamp): Timezone info. nanosecond : int, optional, default 0 Value of nanosecond. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : str Unit used for conversion if ts_input is of type int or float. The - valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For + valid values are 'W', 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For example, 's' means seconds and 'ms' means milliseconds. For float inputs, the result will be stored in nanoseconds, and @@ -1417,6 +1417,11 @@ class Timestamp(_Timestamp): >>> pd.Timestamp(1513393355.5, unit='s') Timestamp('2017-12-16 03:02:35.500000') + This converts an int representing a Unix-epoch in units of weeks + + >>> pd.Timestamp(1535, unit='W') + Timestamp('1999-06-03 00:00:00') + This converts an int representing a Unix-epoch in units of seconds and for a particular timezone @@ -1441,7 +1446,7 @@ class Timestamp(_Timestamp): ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. Notes @@ -1460,11 +1465,21 @@ class Timestamp(_Timestamp): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. 
+ If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. + Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP @@ -1763,6 +1778,16 @@ class Timestamp(_Timestamp): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -2202,6 +2227,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the @@ -2357,6 +2388,17 @@ timedelta}, default 'raise' """ Alias for tzinfo. + The `tz` property provides a simple and direct way to retrieve the timezone + information of a `Timestamp` object. It is particularly useful when working + with time series data that includes timezone information, allowing for easy + access and manipulation of the timezone context. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm') @@ -2382,7 +2424,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -2489,7 +2531,7 @@ default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. 
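The docstring updates in this hunk replace the ``pytz`` examples with the standard-library ``zoneinfo``; a minimal sketch of the same pattern (assumes IANA time zone data is available, e.g. via the ``tzdata`` package on Windows):

.. code-block:: python

    import zoneinfo

    import pandas as pd

    ts = pd.Timestamp("2020-03-14 15:32:52", tz="UTC")
    ts.tz_convert(zoneinfo.ZoneInfo("US/Pacific"))   # convert the wall time
    ts.replace(tzinfo=zoneinfo.ZoneInfo("US/Pacific"))  # swap the tzinfo only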
@@ -2593,13 +2635,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """ diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 10e5790dd1c35..6292b6ce0fd1d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -119,27 +119,26 @@ cpdef inline object get_timezone(tzinfo tz): raise TypeError("tz argument cannot be None") if is_utc(tz): return tz + elif is_zoneinfo(tz): + return tz.key + elif treat_tz_as_pytz(tz): + zone = tz.zone + if zone is None: + return tz + return zone + elif treat_tz_as_dateutil(tz): + if ".tar.gz" in tz._filename: + raise ValueError( + "Bad tz filename. Dateutil on python 3 on windows has a " + "bug which causes tzfile._filename to be the same for all " + "timezone files. Please construct dateutil timezones " + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + "of passing a timezone object. See " + "https://github.com/pandas-dev/pandas/pull/7362") + return "dateutil/" + tz._filename else: - if treat_tz_as_dateutil(tz): - if ".tar.gz" in tz._filename: - raise ValueError( - "Bad tz filename. Dateutil on python 3 on windows has a " - "bug which causes tzfile._filename to be the same for all " - "timezone files. Please construct dateutil timezones " - 'implicitly by passing a string like "dateutil/Europe' - '/London" when you construct your pandas objects instead ' - "of passing a timezone object. See " - "https://github.com/pandas-dev/pandas/pull/7362") - return "dateutil/" + tz._filename - else: - # tz is a pytz timezone or unknown. - try: - zone = tz.zone - if zone is None: - return tz - return zone - except AttributeError: - return tz + return tz cpdef inline tzinfo maybe_get_tz(object tz): diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..5b9ee095d4643 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1813,6 +1813,9 @@ def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, if normalize: # avoid numerical errors on constant series if weighted != cur: + if not adjust and com == 1: + # update in case of irregular-interval series + new_wt = 1. 
- old_wt weighted = old_wt * weighted + new_wt * cur weighted /= (old_wt + new_wt) if adjust: diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 85d03ea17bf42..1cd91ee5b120c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -6,7 +6,6 @@ from sys import byteorder from typing import ( TYPE_CHECKING, - Callable, ContextManager, cast, ) @@ -85,6 +84,8 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( Dtype, NpDtype, @@ -106,6 +107,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: list[Dtype] = [str, "str", "U"] +COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"] diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py index b7fc175b10d17..bbad21d8ab8d1 100644 --- a/pandas/_testing/_hypothesis.py +++ b/pandas/_testing/_hypothesis.py @@ -6,7 +6,6 @@ from hypothesis import strategies as st from hypothesis.extra.dateutil import timezones as dateutil_timezones -from hypothesis.extra.pytz import timezones as pytz_timezones from pandas.compat import is_platform_windows @@ -57,7 +56,7 @@ DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes( min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportArgumentType] max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportArgumentType] - timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), + timezones=st.one_of(st.none(), dateutil_timezones(), st.timezones()), ) DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ = st.datetimes( diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 2955108d3db1a..e1841c95dcdfe 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -7,21 +7,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import uuid import zipfile -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency import pandas as pd from pandas._testing.contexts import ensure_clean if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( FilePath, ReadPickleBuffer, @@ -129,11 +126,15 @@ def write_to_compressed(compression, path, data, dest: str = "test") -> None: elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": - compress_method = get_bz2_file() + import bz2 + + compress_method = bz2.BZ2File elif compression == "zstd": compress_method = import_optional_dependency("zstandard").open elif compression == "xz": - compress_method = get_lzma_file() + import lzma + + compress_method = lzma.LZMAFile else: raise ValueError(f"Unrecognized compression type: {compression}") diff --git a/pandas/_typing.py b/pandas/_typing.py index ef68018f2721a..09a3f58d6ab7f 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterator, Mapping, @@ -18,7 +19,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Optional, Protocol, @@ -90,18 +90,12 @@ # Name "npt._ArrayLikeInt_co" is not defined [name-defined] NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] - from typing import SupportsIndex - - if sys.version_info >= (3, 10): - from typing import Concatenate # pyright: ignore[reportUnusedImport] - 
from typing import ParamSpec - from typing import TypeGuard # pyright: ignore[reportUnusedImport] - else: - from typing_extensions import ( # pyright: ignore[reportUnusedImport] - Concatenate, - ParamSpec, - TypeGuard, - ) + from typing import ( + ParamSpec, + SupportsIndex, + ) + from typing import Concatenate # pyright: ignore[reportUnusedImport] + from typing import TypeGuard # pyright: ignore[reportUnusedImport] P = ParamSpec("P") @@ -516,6 +510,7 @@ def closed(self) -> bool: # ExcelWriter ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"] +ExcelWriterMergeCells = Union[bool, Literal["columns"]] # Offsets OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] diff --git a/pandas/_version.py b/pandas/_version.py index 7bd9da2bb1cfa..b32c9e67fdbb6 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -10,13 +10,13 @@ """Git implementation of _version.py.""" +from collections.abc import Callable import errno import functools import os import re import subprocess import sys -from typing import Callable def get_keywords(): diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4583e7edebbdc..e08da7c7e14e3 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -18,13 +18,11 @@ from pandas.compat._constants import ( IS64, ISMUSL, - PY310, PY311, PY312, PYPY, WASM, ) -import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( pa_version_under10p1, @@ -33,6 +31,7 @@ pa_version_under14p0, pa_version_under14p1, pa_version_under16p0, + pa_version_under17p0, ) if TYPE_CHECKING: @@ -148,52 +147,6 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: - """ - Importing the `LZMAFile` class from the `lzma` module. - - Returns - ------- - class - The `LZMAFile` class from the `lzma` module. - - Raises - ------ - RuntimeError - If the `lzma` module was not imported correctly, or didn't exist. - """ - if not pandas.compat.compressors.has_lzma: - raise RuntimeError( - "lzma module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." - ) - return pandas.compat.compressors.LZMAFile - - -def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: - """ - Importing the `BZ2File` class from the `bz2` module. - - Returns - ------- - class - The `BZ2File` class from the `bz2` module. - - Raises - ------ - RuntimeError - If the `bz2` module was not imported correctly, or didn't exist. - """ - if not pandas.compat.compressors.has_bz2: - raise RuntimeError( - "bz2 module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." 
- ) - return pandas.compat.compressors.BZ2File - - __all__ = [ "is_numpy_dev", "pa_version_under10p1", @@ -202,9 +155,9 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p0", "pa_version_under14p1", "pa_version_under16p0", + "pa_version_under17p0", "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 2625389e5254a..c7b7341013251 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -13,7 +13,6 @@ IS64 = sys.maxsize > 2**32 -PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PY312 = sys.version_info >= (3, 12) PYPY = platform.python_implementation() == "PyPy" @@ -24,7 +23,6 @@ __all__ = [ "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index f4e717c26d6fd..b62a4c8dcc8c8 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: import types -# Update install.rst, actions-39-minimum_versions.yaml, +# Update install.rst, actions-310-minimum_versions.yaml, # deps_minimum.toml & pyproject.toml when updating versions! VERSIONS = { diff --git a/pandas/compat/compressors.py b/pandas/compat/compressors.py deleted file mode 100644 index 1f31e34c092c9..0000000000000 --- a/pandas/compat/compressors.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. -""" - -from __future__ import annotations - -from pickle import PickleBuffer - -from pandas.compat._constants import PY310 - -try: - import bz2 - - has_bz2 = True -except ImportError: - has_bz2 = False - -try: - import lzma - - has_lzma = True -except ImportError: - has_lzma = False - - -def flatten_buffer( - b: bytes | bytearray | memoryview | PickleBuffer, -) -> bytes | bytearray | memoryview: - """ - Return some 1-D `uint8` typed buffer. - - Coerces anything that does not match that description to one that does - without copying if possible (otherwise will copy). - """ - - if isinstance(b, (bytes, bytearray)): - return b - - if not isinstance(b, PickleBuffer): - b = PickleBuffer(b) - - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - return b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes("A") - - -if has_bz2: - - class BZ2File(bz2.BZ2File): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) - - -if has_lzma: - - class LZMAFile(lzma.LZMAFile): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. 
- return super().write(flatten_buffer(b)) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 54a12c76a230b..2fab8f32b8e71 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,7 +12,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") -np_version_gt2 = _nlv >= Version("2.0.0.dev0") +np_version_gt2 = _nlv >= Version("2.0.0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.23.5" diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 5a96e5a4cc49a..87d3dc86cee87 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -25,3 +26,4 @@ pa_version_under14p1 = True pa_version_under15p0 = True pa_version_under16p0 = True + pa_version_under17p0 = True diff --git a/pandas/conftest.py b/pandas/conftest.py index 163c3890a7f6d..70e729dfb98a4 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -32,10 +32,7 @@ import gc import operator import os -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import uuid from dateutil.tz import ( @@ -83,6 +80,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) @@ -1450,6 +1448,21 @@ def complex_dtype(request): return request.param +@pytest.fixture(params=tm.COMPLEX_FLOAT_DTYPES) +def complex_or_float_dtype(request): + """ + Parameterized fixture for complex and numpy float dtypes. 
+ + * complex + * 'complex64' + * 'complex128' + * float + * 'float32' + * 'float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_INT_NUMPY_DTYPES) def any_signed_int_numpy_dtype(request): """ diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0a26acb7df60a..3f3ebe8dbe023 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -4,16 +4,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import Scalar import numpy as np from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import jit_user_function + @functools.cache def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): @@ -21,10 +23,10 @@ def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): import numba else: numba = import_optional_dependency("numba") - nb_compat_func = numba.extending.register_jitable(func) + nb_compat_func = jit_user_function(func) @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def nb_looper(values, axis): + def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape if axis == 0: @@ -33,7 +35,7 @@ def nb_looper(values, axis): else: first_elem = values[0] dim0 = values.shape[0] - res0 = nb_compat_func(first_elem) + res0 = nb_compat_func(first_elem, *args) # Use np.asarray to get shape for # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape @@ -44,11 +46,11 @@ def nb_looper(values, axis): if axis == 1: buff[0] = res0 for i in numba.prange(1, values.shape[0]): - buff[i] = nb_compat_func(values[i]) + buff[i] = nb_compat_func(values[i], *args) else: buff[:, 0] = res0 for j in numba.prange(1, values.shape[1]): - buff[:, j] = nb_compat_func(values[:, j]) + buff[:, j] = nb_compat_func(values[:, j], *args) return buff return nb_looper diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 3acbfc3eabbac..d8463fda34caa 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, - Callable, final, ) import warnings @@ -18,6 +17,8 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import TypeT from pandas import Index diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0d97f8a298fdb..948836bf6a51d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -68,6 +68,7 @@ ABCExtensionArray, ABCIndex, ABCMultiIndex, + ABCNumpyExtensionArray, ABCSeries, ABCTimedeltaArray, ) @@ -222,13 +223,17 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike: """ ensure that we are arraylike if not already """ - if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + if not isinstance( + values, + (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray, ABCNumpyExtensionArray), + ): # GH#52986 if func_name != "isin-targets": # Make an exception for the comps argument in isin. raise TypeError( f"{func_name} requires a Series, Index, " - f"ExtensionArray, or np.ndarray, got {type(values).__name__}." + f"ExtensionArray, np.ndarray or NumpyExtensionArray " + f"got {type(values).__name__}." 
) inferred = lib.infer_dtype(values, skipna=False) @@ -324,7 +329,7 @@ def unique(values): Returns ------- - numpy.ndarray or ExtensionArray + numpy.ndarray, ExtensionArray or NumpyExtensionArray The return can be: @@ -332,7 +337,7 @@ def unique(values): * Categorical : when the input is a Categorical dtype * ndarray : when the input is a Series/ndarray - Return numpy.ndarray or ExtensionArray. + Return numpy.ndarray, ExtensionArray or NumpyExtensionArray. See Also -------- @@ -404,6 +409,13 @@ def unique(values): >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + + An NumpyExtensionArray of complex + + >>> pd.unique(pd.array([1 + 1j, 2, 3])) + + [(1+1j), (2+0j), (3+0j)] + Length: 3, dtype: complex128 """ return unique_with_mask(values) @@ -1161,11 +1173,14 @@ def take( ... ) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, + (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, ABCNumpyExtensionArray), + ): # GH#52981 raise TypeError( - "pd.api.extensions.take requires a numpy.ndarray, " - f"ExtensionArray, Index, or Series, got {type(arr).__name__}." + "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " + f"Index, Series, or NumpyExtensionArray got {type(arr).__name__}." ) indices = ensure_platform_int(indices) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2039386c4766c..607a65598783f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,13 +2,13 @@ import abc from collections import defaultdict +from collections.abc import Callable import functools from functools import partial import inspect from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) @@ -51,6 +51,10 @@ from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.util.numba_ import ( + get_jit_arguments, + prepare_function_arguments, +) if TYPE_CHECKING: from collections.abc import ( @@ -70,7 +74,6 @@ from pandas.core.resample import Resampler from pandas.core.window.rolling import BaseWindow - ResType = dict[int, Any] @@ -997,17 +1000,20 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_kwargs = {} if engine_kwargs is None else engine_kwargs - + args, kwargs = prepare_function_arguments( + self.func, # type: ignore[arg-type] + self.args, + self.kwargs, + ) # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( self.func, # type: ignore[arg-type] - **engine_kwargs, + **get_jit_arguments(engine_kwargs, kwargs), ) - result = nb_looper(self.values, self.axis) + result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) else: @@ -1148,21 +1154,23 @@ def generate_numba_apply_func( # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. 
@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, df_index): + def numba_func(values, col_names, df_index, *args): results = {} for j in range(values.shape[1]): # Create the series ser = Series( values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) ) - results[j] = jitted_udf(ser) + results[j] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1177,7 +1185,7 @@ def apply_with_numba(self) -> dict[int, Any]: # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(index) as index, set_numba_data(columns) as columns: - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @property @@ -1285,7 +1293,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index): + def numba_func(values, col_names_index, index, *args): results = {} # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. @@ -1297,15 +1305,17 @@ def numba_func(values, col_names_index, index): index=col_names_index, name=maybe_cast_str(index[i]), ) - results[i] = jitted_udf(ser) + results[i] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1316,7 +1326,7 @@ def apply_with_numba(self) -> dict[int, Any]: set_numba_data(self.obj.index) as index, set_numba_data(self.columns) as columns, ): - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index c3a7c2e4fefb2..bc10dbfbec90d 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING import numpy as np @@ -12,6 +12,9 @@ from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from collections.abc import Callable + def _cum_func( func: Callable, diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index b31d32a606eed..b4e116388b85e 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -5,14 +5,13 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import npt diff --git 
a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3784689995802..f2a32fbe2b0e5 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -5,10 +5,7 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -18,6 +15,8 @@ from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, npt, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3d55513ab914c..943656ba48432 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -18,7 +17,6 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, Timedelta, Timestamp, timezones, @@ -175,7 +173,10 @@ def floordiv_compat( } if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._libs.missing import NAType from pandas._typing import ( @@ -2612,17 +2613,19 @@ def _str_wrap(self, width: int, **kwargs) -> Self: @property def _dt_days(self) -> Self: return type(self)( - pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + pa.array( + self._to_timedeltaarray().components.days, + from_pandas=True, + type=pa.int32(), + ) ) @property def _dt_hours(self) -> Self: return type(self)( pa.array( - [ - td.components.hours if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.hours, + from_pandas=True, type=pa.int32(), ) ) @@ -2631,10 +2634,8 @@ def _dt_hours(self) -> Self: def _dt_minutes(self) -> Self: return type(self)( pa.array( - [ - td.components.minutes if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.minutes, + from_pandas=True, type=pa.int32(), ) ) @@ -2643,7 +2644,9 @@ def _dt_minutes(self) -> Self: def _dt_seconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.seconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2651,10 +2654,8 @@ def _dt_seconds(self) -> Self: def _dt_milliseconds(self) -> Self: return type(self)( pa.array( - [ - td.components.milliseconds if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.milliseconds, + from_pandas=True, type=pa.int32(), ) ) @@ -2663,7 +2664,7 @@ def _dt_milliseconds(self) -> Self: def _dt_microseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().microseconds, + self._to_timedeltaarray().components.microseconds, from_pandas=True, type=pa.int32(), ) @@ -2673,7 +2674,9 @@ def _dt_microseconds(self) -> Self: def _dt_nanoseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.nanoseconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2791,7 +2794,10 @@ def _dt_days_in_month(self) -> Self: @property def _dt_microsecond(self) -> Self: - return type(self)(pc.microsecond(self._pa_array)) + # GH 59154 + us = pc.microsecond(self._pa_array) + ms_to_us = pc.multiply(pc.millisecond(self._pa_array), 1000) + return type(self)(pc.add(us, ms_to_us)) 
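For context on the `_dt_microsecond` change (GH 59154): Arrow's `microsecond` kernel extracts only the sub-millisecond component (0-999), whereas pandas defines the microsecond accessor as the full microsecond count within the second (0-999999), so the millisecond component has to be folded back in. A minimal sketch of that recombination, assuming pyarrow is installed (the sample timestamp is illustrative):

import pyarrow as pa
import pyarrow.compute as pc

# 00:00:01.500500 -> millisecond component 500, microsecond component 500
ts = pa.array([1_500_500], type=pa.timestamp("us"))
pc.microsecond(ts)  # [500]: the sub-millisecond part only
pc.millisecond(ts)  # [500]
# pandas semantics (0..999999 within the second) need both components:
pc.add(pc.microsecond(ts), pc.multiply(pc.millisecond(ts), 1000))  # [500500]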
@property def _dt_minute(self) -> Self: @@ -2970,7 +2976,7 @@ def transpose_homogeneous_pyarrow( """ arrays = list(arrays) nrows, ncols = len(arrays[0]), len(arrays) - indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1) arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) arr = arr.take(indices) return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f83fdcd46b371..1e8fec7fde3de 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, cast, @@ -78,6 +77,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index a326925545045..74c0cd7719c13 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -329,15 +329,21 @@ def _from_sequence_of_strings( copy: bool = False, true_values: list[str] | None = None, false_values: list[str] | None = None, + none_values: list[str] | None = None, ) -> BooleanArray: true_values_union = cls._TRUE_VALUES.union(true_values or []) false_values_union = cls._FALSE_VALUES.union(false_values or []) - def map_string(s) -> bool: + if none_values is None: + none_values = [] + + def map_string(s) -> bool | None: if s in true_values_union: return True elif s in false_values_union: return False + elif s in none_values: + return None else: raise ValueError(f"{s} cannot be cast to bool") diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 64e5eec43a5c1..18b52f741370f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,12 +6,10 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, ) -import warnings import numpy as np @@ -24,7 +22,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -94,6 +91,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, @@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) - @overload - def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ... - - @overload - def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ... 
- - def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None: - from pandas import Index - - orig_dtype = self.dtype - - inplace = validate_bool_kwarg(inplace, "inplace") - cat = self if inplace else self.copy() - - mask = isna(np.asarray(value)) - if mask.any(): - removals = np.asarray(to_replace)[mask] - removals = cat.categories[cat.categories.isin(removals)] - new_cat = cat.remove_categories(removals) - NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) - - ser = cat.categories.to_series() - ser = ser.replace(to_replace=to_replace, value=value) - - all_values = Index(ser) - - # GH51016: maintain order of existing categories - idxr = cat.categories.get_indexer_for(all_values) - locs = np.arange(len(ser)) - locs = np.where(idxr == -1, locs, idxr) - locs = locs.argsort() - - new_categories = ser.take(locs) - new_categories = new_categories.drop_duplicates(keep="first") - index_categories = Index(new_categories) - new_codes = recode_for_categories( - cat._codes, all_values, index_categories, copy=False - ) - new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) - - if new_dtype != orig_dtype: - warnings.warn( - # GH#55147 - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. " - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not inplace: - return cat - return None - # ------------------------------------------------------------------------ # String methods interface def _str_map( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 673001337767b..c90ff410b4b93 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Union, cast, @@ -148,6 +147,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 077bde35a4c94..34d25f04b69e1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -594,7 +594,7 @@ def tz(self) -> tzinfo | None: Returns ------- - datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + zoneinfo.ZoneInfo, datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None Returns None when the array is tz-naive. See Also -------- @@ -624,7 +624,7 @@ def tz(self) -> tzinfo | None: ... ) >>> idx.tz datetime.timezone.utc - """ + """ # noqa: E501 # GH 18595 return getattr(self.dtype, "tz", None) @@ -863,7 +863,7 @@ def tz_convert(self, tz) -> Self: Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None Time zone for time. Corresponding timestamps would be converted to this time zone of the Datetime Array/Index. A `tz` of None will convert to UTC and remove the timezone information.
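The tz docstrings above now list `zoneinfo.ZoneInfo` first among the accepted timezone types. A small usage sketch of the documented behavior, using only the standard library and public pandas API:

import zoneinfo

import pandas as pd

idx = pd.date_range("2014-08-01 09:00", periods=3, freq="h", tz="UTC")
# any tzinfo implementation is accepted, including the stdlib zoneinfo one
idx.tz_convert(zoneinfo.ZoneInfo("Europe/Warsaw"))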
@@ -923,7 +923,7 @@ def tz_convert(self, tz) -> Self: '2014-08-01 08:00:00', '2014-08-01 09:00:00'], dtype='datetime64[ns]', freq='h') - """ + """ # noqa: E501 tz = timezones.maybe_get_tz(tz) if self.tz is None: @@ -955,7 +955,7 @@ def tz_localize( Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None Time zone to convert timestamps to. Passing ``None`` will remove the time zone information preserving local time. ambiguous : 'infer', 'NaT', bool array, default 'raise' @@ -1081,7 +1081,7 @@ def tz_localize( 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] - """ + """ # noqa: E501 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta @@ -2119,6 +2119,32 @@ def isocalendar(self) -> DataFrame: >>> idx.is_year_start array([False, False, True]) + + This method, when applied to Series with datetime values under + the ``.dt`` accessor, will lose information about Business offsets. + + >>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS")) + >>> dates + 0 2021-01-01 + 1 2022-01-03 + 2 2023-01-02 + 3 2024-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_start + 0 True + 1 False + 2 False + 3 True + dtype: bool + + >>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS") + >>> idx + DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'], + dtype='datetime64[ns]', freq='BYS-JAN') + + >>> idx.is_year_start + array([ True, True, True, True]) """, ) is_year_end = _field_accessor( diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2e1ea7236e5c4..52d64162358c8 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -8,7 +8,6 @@ import textwrap from typing import ( TYPE_CHECKING, - Callable, Literal, Union, overload, @@ -99,6 +98,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 04cffcaaa5f04..93471788e72ab 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,7 +3,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -73,6 +72,7 @@ from pandas.core.util.hashing import hash_array if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Iterator, Sequence, diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index c5e9ed8698ffe..2c0236273e731 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -28,7 +27,10 @@ ) if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) import pyarrow diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8baf363b909fb..e762c3e547819 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, cast, @@ -75,7 +74,10 @@ import pandas.core.common as com if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( AnyArrayLike, diff --git a/pandas/core/arrays/sparse/accessor.py
b/pandas/core/arrays/sparse/accessor.py index 6a1c25711acb0..b8245349a4e62 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -98,8 +98,8 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) ... ) >>> A - <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'float64' + with 3 stored elements and shape (3, 4)> >>> A.todense() matrix([[0., 0., 1., 2.], @@ -186,8 +186,8 @@ def to_coo( ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True ... ) >>> A - <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'float64' + with 3 stored elements and shape (3, 4)> >>> A.todense() matrix([[0., 0., 1., 3.], [3., 0., 0., 0.], @@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3, dtype=float) + >>> mat = scipy.sparse.eye(3, dtype=int) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 - 0 1.0 0 0 - 1 0 1.0 0 - 2 0 0 1.0 + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 """ from pandas._libs.sparse import IntIndex @@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: indices = data.indices indptr = data.indptr array_data = data.data - dtype = SparseDtype(array_data.dtype, 0) + dtype = SparseDtype(array_data.dtype) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) @@ -380,8 +380,8 @@ def to_coo(self) -> spmatrix: -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.to_coo() - <4x1 sparse matrix of type '<class 'numpy.int64'>' - with 2 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'int64' + with 2 stored elements and shape (4, 1)> """ import_optional_dependency("scipy") from scipy.sparse import coo_matrix @@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix: cols, rows, data = [], [], [] for col, (_, ser) in enumerate(self._parent.items()): sp_arr = ser.array - if sp_arr.fill_value != 0: - raise ValueError("fill value must be 0 when converting to COO matrix") row = sp_arr.sp_index.indices cols.append(np.repeat(col, len(row))) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index adf8f44377e62..3a08344369822 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -10,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -87,7 +86,10 @@ # See https://github.com/python/typing/issues/684 if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from enum import Enum class ellipsis(Enum): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f2fd9d5d6610f..97c06149d0b7e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Union, cast, ) @@ -53,7 +52,10 @@ if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( ArrayLike, diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 865e81d7754ef..15bfe442ca87f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -746,7 +746,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: -------- **Series** - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days @@ -765,7 +765,7 @@
**TimedeltaIndex** - >>> idx = pd.to_timedelta(np.arange(5), unit="d") + >>> idx = pd.to_timedelta(np.arange(5), unit="D") >>> idx TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) @@ -809,7 +809,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='d')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='D')) >>> ser 0 1 days 1 2 days diff --git a/pandas/core/common.py b/pandas/core/common.py index 96291991227d9..ec0473a20458b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -12,6 +12,7 @@ defaultdict, ) from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -24,7 +25,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, TypeVar, cast, overload, @@ -145,7 +145,7 @@ def is_bool_indexer(key: Any) -> bool: elif isinstance(key, list): # check if np.array(key).dtype would be bool if len(key) > 0: - if type(key) is not list: # noqa: E721 + if type(key) is not list: # GH#42461 cython will raise TypeError if we pass a subclass key = list(key) return lib.is_bool_list(key) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index b4e33b8ac75cb..7de4d8cdf99e1 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -8,10 +8,7 @@ partial, wraps, ) -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -31,7 +28,10 @@ from pandas.core.computation.common import result_type_many if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import F diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index fee08c6199eef..aad768d31483a 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -193,8 +193,11 @@ def eval( corresponding bitwise operators. :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are supported and behave as they would with plain ol' Python evaluation. - `eval` can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. + + .. warning:: + + ``eval`` can run arbitrary code which can make you vulnerable to code + injection if you pass it untrusted data.
Parameters ---------- diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index a8123a898b4fe..b074e768e0842 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -12,7 +12,7 @@ from keyword import iskeyword import tokenize from typing import ( - Callable, + TYPE_CHECKING, ClassVar, TypeVar, ) @@ -32,7 +32,6 @@ UNARY_OPS_SYMS, BinOp, Constant, - Div, FuncNode, Op, Term, @@ -47,6 +46,9 @@ from pandas.io.formats import printing +if TYPE_CHECKING: + from collections.abc import Callable + def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: """ @@ -371,7 +373,7 @@ class BaseExprVisitor(ast.NodeVisitor): "Add", "Sub", "Mult", - None, + "Div", "Pow", "FloorDiv", "Mod", @@ -534,9 +536,6 @@ def visit_BinOp(self, node, **kwargs): left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) - def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs) - def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b7a1cb173f659..a1a5f77f8539e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -36,6 +35,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Iterator, ) @@ -327,31 +327,6 @@ def _not_in(x, y): _binary_ops_dict.update(d) -def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: - """ - Cast an expression inplace. - - Parameters - ---------- - terms : Op - The expression that should cast. - acceptable_dtypes : list of acceptable numpy.dtype - Will not cast if term's dtype in this list. - dtype : str or numpy.dtype - The dtype to cast to. - """ - dt = np.dtype(dtype) - for term in terms: - if term.type in acceptable_dtypes: - continue - - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) - - def is_term(obj) -> bool: return isinstance(obj, Term) @@ -508,34 +483,6 @@ def _disallow_scalar_only_bool_ops(self) -> None: raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype) -> bool: - return issubclass(np.dtype(dtype).type, np.number) - - -class Div(BinOp): - """ - Div operator to special case casting. - - Parameters - ---------- - lhs, rhs : Term or Op - The Terms or Ops in the ``/`` expression. 
- """ - - def __init__(self, lhs, rhs) -> None: - super().__init__("/", lhs, rhs) - - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError( - f"unsupported operand type(s) for {self.op}: " - f"'{lhs.return_type}' and '{rhs.return_type}'" - ) - - # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float64] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) - - UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) _unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 46c9139c3456c..05661033bd5ed 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,8 +12,8 @@ from __future__ import annotations +from collections.abc import Callable import os -from typing import Callable import pandas._config.config as cf from pandas._config.config import ( diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 662b8c5791e51..f2af69fcc9d84 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,7 +39,6 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -1643,13 +1642,11 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. - if not np_version_gt2: - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of " - "out-of-bound Python int", - DeprecationWarning, - ) + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of " "out-of-bound Python int", + DeprecationWarning, + ) casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2ac75a0700759..bee8af46baa64 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import warnings @@ -55,6 +54,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, DtypeObj, diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5213be8b69016..3aeab96e03163 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - `SparseDtype` is used as the data type for :class:`SparseArray`, enabling + ``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling more efficient storage of data that contains a significant number of repetitive values typically represented by a fill value. It supports any scalar dtype as the underlying data type of the non-fill values. @@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype): The dtype of the underlying array storing the non-fill value values. fill_value : scalar, optional The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. + depends on ``dtype``. 
=========== ========== dtype na_value =========== ========== float ``np.nan`` + complex ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` timedelta64 ``pd.NaT`` =========== ========== - The default value may be overridden by specifying a `fill_value`. + The default value may be overridden by specifying a ``fill_value``. Attributes ---------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0e21136f8a97..b9cd6ae2f13e8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): nan >>> na_value_for_dtype(np.dtype("float64")) nan + >>> na_value_for_dtype(np.dtype("complex128")) + nan >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) @@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): elif dtype.kind in "mM": unit = np.datetime_data(dtype)[0] return dtype.type("NaT", unit) - elif dtype.kind == "f": + elif dtype.kind in "fc": return np.nan elif dtype.kind in "iu": if compat: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a6c0e1e372530..5ef663564a016 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,6 +14,7 @@ import collections from collections import abc from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -29,7 +30,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -4782,6 +4782,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ValueError * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements + TypeError * If any kind of string dtype is passed in. See Also @@ -11644,7 +11645,7 @@ def all( **kwargs, ) -> Series | bool: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @doc(make_doc("all", ndim=1)) def all( self, @@ -11691,7 +11692,7 @@ def min( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") @doc(make_doc("min", ndim=2)) def min( self, @@ -11738,7 +11739,7 @@ def max( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") @doc(make_doc("max", ndim=2)) def max( self, @@ -11754,7 +11755,7 @@ def max( result = result.__finalize__(self, method="max") return result - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = 0, @@ -11855,7 +11856,7 @@ def sum( result = result.__finalize__(self, method="sum") return result - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") def prod( self, axis: Axis | None = 0, @@ -11973,7 +11974,7 @@ def mean( **kwargs, ) -> Series | Any: ... 
- @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=2)) def mean( self, @@ -12020,7 +12021,7 @@ def median( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") @doc(make_doc("median", ndim=2)) def median( self, @@ -12070,7 +12071,7 @@ def sem( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") def sem( self, axis: Axis | None = 0, @@ -12190,7 +12191,7 @@ def var( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") def var( self, axis: Axis | None = 0, @@ -12309,7 +12310,7 @@ def std( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") def std( self, axis: Axis | None = 0, @@ -12432,7 +12433,7 @@ def skew( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") def skew( self, axis: Axis | None = 0, @@ -12552,7 +12553,7 @@ def kurt( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") def kurt( self, axis: Axis | None = 0, @@ -13078,7 +13079,7 @@ def quantile( if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) + cols = self.columns[:0] dtype = np.float64 if axis == 1: @@ -13327,6 +13328,11 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: Series.str.contains: Test if pattern or regex is contained within a string of a Series or Index. + Notes + ----- + ``__iter__`` is used (and not ``__contains__``) to iterate over values + when checking if it contains the elements in DataFrame. + Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 84745b25b5eef..2a0495dff6681 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -158,7 +157,6 @@ Index, MultiIndex, PeriodIndex, - RangeIndex, default_index, ensure_index, ) @@ -186,6 +184,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, @@ -1852,7 +1851,7 @@ def _drop_labels_or_levels(self, keys, axis: AxisInt = 0): else: # Drop the last level of Index by replacing with # a RangeIndex - dropped.columns = RangeIndex(dropped.columns.size) + dropped.columns = default_index(dropped.columns.size) # Handle dropping index labels if labels_to_drop: @@ -2388,7 +2387,8 @@ def to_json( index : bool or None, default None The index is only used when 'orient' is 'split', 'index', 'column', or 'table'. Of these, 'index' and 'column' do not support - `index=False`. 
+ `index=False`. The string 'index' as a column name with an empty :class:`Index`, + or as the :class:`Index` name itself, will raise a ``ValueError``. indent : int, optional Length of whitespace used to indent each record. @@ -2780,7 +2780,8 @@ def to_sql( ---------- name : str Name of SQL table. - con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + con : ADBC connection, sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. The user is responsible for engine disposal and connection closure for the SQLAlchemy @@ -2967,6 +2968,22 @@ def to_sql( >>> with engine.connect() as conn: ... conn.execute(text("SELECT * FROM integers")).fetchall() [(1,), (None,), (2,)] + + .. versionadded:: 2.2.0 + + pandas now supports writing via ADBC drivers. + + >>> df = pd.DataFrame({'name' : ['User 10', 'User 11', 'User 12']}) + >>> df + name + 0 User 10 + 1 User 11 + 2 User 12 + + >>> from adbc_driver_sqlite import dbapi # doctest:+SKIP + >>> with dbapi.connect("sqlite://") as conn: # doctest:+SKIP + ... df.to_sql(name="users", con=conn) + 3 """ # noqa: E501 from pandas.io import sql @@ -5732,7 +5749,7 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional - Default 'None' results in equal probability weighting. + Default ``None`` results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned @@ -5747,6 +5764,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. ..
versionchanged:: 1.4.0 @@ -6017,17 +6035,16 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": + objs = other.objs # propagate attrs only if all concat arguments have the same attrs - if all(bool(obj.attrs) for obj in other.objs): + if all(bool(obj.attrs) for obj in objs): # all concatenate arguments have non-empty attrs - attrs = other.objs[0].attrs - have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + attrs = objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in objs[1:]) if have_same_attrs: self.attrs = deepcopy(attrs) - allows_duplicate_labels = all( - x.flags.allows_duplicate_labels for x in other.objs - ) + allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs) self.flags.allows_duplicate_labels = allows_duplicate_labels return self @@ -9271,7 +9288,9 @@ def compare( # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + np.arange(diff.shape[axis]) + .reshape([2, diff.shape[axis] // 2]) + .T.reshape(-1) ) diff = diff.take(indices, axis=axis) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 945b9f9c14c0b..c112d9b6a4b54 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -9,12 +9,12 @@ from __future__ import annotations from collections import abc +from collections.abc import Callable from functools import partial from textwrap import dedent from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NamedTuple, TypeVar, @@ -124,6 +124,10 @@ class NamedAgg(NamedTuple): Function to apply to the provided column. If string, the name of a built-in pandas function. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) @@ -682,7 +686,8 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: b 1 dtype: int64 """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1b58317c08736..bf71bb80b3623 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -10,6 +10,7 @@ class providing the base-class of operations. from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -24,7 +25,6 @@ class providing the base-class of operations. from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, TypeVar, Union, @@ -128,7 +128,6 @@ class providing the base-class of operations. 
from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, default_index, ) from pandas.core.internals.blocks import ensure_block_shape @@ -1264,7 +1263,7 @@ def _set_result_index_ordered( if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex - result = result.reindex(RangeIndex(len(index)), axis=0) + result = result.reindex(default_index(len(index)), axis=0) result = result.set_axis(index, axis=0) return result @@ -1334,7 +1333,7 @@ def _wrap_aggregated_output( # enforced in __init__ result = self._insert_inaxis_grouper(result, qs=qs) result = result._consolidate() - result.index = RangeIndex(len(result)) + result.index = default_index(len(result)) else: index = self._grouper.result_index @@ -1360,7 +1359,7 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, ngroups = self._grouper.group_info + ngroups = self._grouper.ngroups sorted_index = self._grouper.result_ilocs sorted_ids = self._grouper._sorted_ids @@ -1969,7 +1968,8 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2185,7 +2185,8 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups mask = ids != -1 is_series = data.ndim == 1 @@ -3840,7 +3841,8 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups col_func = partial( libgroupby.group_fillna_indexer, @@ -4361,7 +4363,8 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups if self.dropna: # splitter drops NA groups, we need to do the same ids = ids[ids >= 0] @@ -5038,7 +5041,8 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5385,6 +5389,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. versionchanged:: 1.4.0 @@ -5399,6 +5404,7 @@ def sample( See Also -------- DataFrame.sample: Generate random samples from a DataFrame object. + Series.sample: Generate random samples from a Series object. numpy.random.choice: Generate a random sample from a given 1-D numpy array. 
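The recurring substitution above unpacks the former `group_info` tuple into its two halves: `ids` (one integer group label per row, with -1 marking dropped NA groups) and `ngroups`. A rough public-API illustration of what those two values encode (the sample data is made up):

import numpy as np
import pandas as pd

ser = pd.Series([10, 20, 30, 40], index=["a", "b", "a", "c"])
ids, uniques = pd.factorize(ser.index)
# ids -> array([0, 1, 0, 2]): a per-row group label, like grouper.ids
ngroups = len(uniques)  # 3, like grouper.ngroups
np.bincount(ids[ids != -1], minlength=ngroups)  # group sizes, cf. BaseGrouper.size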
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e75a5b9089f5f..5f680de77649f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -34,6 +34,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, + default_index, ) from pandas.core.series import Series @@ -901,7 +902,7 @@ def is_in_obj(gpr) -> bool: if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") if len(groupings) == 0: - groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + groupings.append(Grouping(default_index(0), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index b22fc9248eeca..73b681c64c3a3 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -20,6 +19,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..da80969b613cd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,6 @@ import functools from typing import ( TYPE_CHECKING, - Callable, Generic, final, ) @@ -70,10 +69,10 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Hashable, Iterator, - Sequence, ) from pandas.core.generic import NDFrame @@ -581,14 +580,14 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence[grouper.Grouping], + groupings: list[grouper.Grouping], sort: bool = True, dropna: bool = True, ) -> None: assert isinstance(axis, Index), axis self.axis = axis - self._groupings: list[grouper.Grouping] = list(groupings) + self._groupings = groupings self._sort = sort self.dropna = dropna @@ -596,10 +595,6 @@ def __init__( def groupings(self) -> list[grouper.Grouping]: return self._groupings - @property - def shape(self) -> Shape: - return tuple(ping.ngroups for ping in self.groupings) - def __iter__(self) -> Iterator[Hashable]: return iter(self.indices) @@ -628,11 +623,15 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, ngroups = self.group_info - return _get_splitter( + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( data, - ids, - ngroups, + self.ngroups, sorted_ids=self._sorted_ids, sort_idx=self.result_ilocs, ) @@ -692,7 +691,8 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, ngroups = self.group_info + ids = self.ids + ngroups = self.ngroups out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -729,12 +729,6 @@ def has_dropped_na(self) -> bool: """ return bool((self.ids < 0).any()) - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - result_index, ids = self.result_index_and_ids - ngroups = len(result_index) - return ids, ngroups - @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis @@ -1123,10 +1117,6 @@ def indices(self): i = bin return indices - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - return self.ids, self.ngroups - @cache_readonly def codes(self) -> list[npt.NDArray[np.intp]]: return [self.ids] @@ -1191,29 +1181,25 @@ class DataSplitter(Generic[NDFrameT]): def __init__( self, data: NDFrameT, - labels: npt.NDArray[np.intp], ngroups: int, *, sort_idx: npt.NDArray[np.intp], sorted_ids: npt.NDArray[np.intp], ) -> None: self.data = data - self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups self._slabels = sorted_ids self._sort_idx = sort_idx def __iter__(self) -> Iterator: - sdata = self._sorted_data - if self.ngroups == 0: # we are inside a generator, rather than raise StopIteration # we merely return signal the end return starts, ends = lib.generate_slices(self._slabels, self.ngroups) - + sdata = self._sorted_data for start, end in zip(starts, ends): yield self._chop(sdata, slice(start, end)) @@ -1241,20 +1227,3 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: mgr = sdata._mgr.get_slice(slice_obj, axis=1) df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) return df.__finalize__(sdata, method="groupby") - - -def _get_splitter( - data: NDFrame, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], -) -> DataSplitter: - if isinstance(data, Series): - klass: type[DataSplitter] = SeriesSplitter - else: - # i.e. DataFrame - klass = FrameSplitter - - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 3cb51f7447677..e2dc71f68a65b 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -459,7 +459,7 @@ def to_pytimedelta(self) -> np.ndarray: Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 83e8df5072b92..5144e647e73b4 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -130,7 +130,7 @@ def _get_combined_index( # TODO: handle index names! 
indexes = _get_distinct_objs(indexes) if len(indexes) == 0: - index = Index([]) + index: Index = default_index(0) elif len(indexes) == 1: index = indexes[0] elif intersect: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 15c318e5e9caf..7d43498d4267b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -8,7 +8,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -193,6 +192,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Sequence, @@ -7528,7 +7528,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: index_like = list(index_like) if isinstance(index_like, list): - if type(index_like) is not list: # noqa: E721 + if type(index_like) is not list: # must check for exactly list here because of strict type # check in clean_index_list index_like = list(index_like) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7e8d808769bc1..e1120466eaf83 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -523,7 +523,7 @@ def _as_range_index(self) -> RangeIndex: # Convert our i8 representations to RangeIndex # Caller is responsible for checking isinstance(self.freq, Tick) freq = cast(Tick, self.freq) - tick = Timedelta(freq).as_unit("ns")._value + tick = Timedelta(freq).as_unit(self.unit)._value rng = range(self[0]._value, self[-1]._value + tick, tick) return RangeIndex(rng) @@ -536,7 +536,9 @@ def _wrap_range_setop(self, other, res_i8) -> Self: # RangeIndex defaults to step=1, which we don't want. new_freq = self.freq elif isinstance(res_i8, RangeIndex): - new_freq = to_offset(Timedelta(res_i8.step)) + new_freq = to_offset( + Timedelta(res_i8.step, unit=self.unit).as_unit(self.unit) + ) # TODO(GH#41493): we cannot just do # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c276750314a34..00a929724ed4c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -147,7 +147,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation. - tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + tz : zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or str Set the Timezone of the data. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. 
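The `_as_range_index` change above matters for non-nanosecond data: the underlying i8 values are counted in the index's own unit, so hard-coding `as_unit("ns")` for the step would be off by the unit ratio. A hedged sketch with `Timedelta` (`_value` is the integer representation the i8 path uses):

import pandas as pd

freq = pd.Timedelta(days=1)
freq.as_unit("ns")._value  # 86_400_000_000_000: only correct for "ns" data
freq.as_unit("s")._value   # 86_400: matches the i8 values of a second-resolution index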
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index fc806a3546571..48d5e59250f35 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -7,7 +7,6 @@ from inspect import signature from typing import ( TYPE_CHECKING, - Callable, TypeVar, ) @@ -18,6 +17,8 @@ from pandas.core.indexes.base import Index if TYPE_CHECKING: + from collections.abc import Callable + import numpy as np from pandas._typing import ( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a8c05ab78c98e..19c94fa4104d7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -12,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) @@ -638,7 +638,6 @@ def from_product( (2, 'purple')], names=['number', 'color']) """ - from pandas.core.reshape.util import cartesian_product if not is_list_like(iterables): raise TypeError("Input must be a list / sequence of iterables.") @@ -1031,6 +1030,13 @@ def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. + See Also + -------- + MultiIndex.levels : Get the levels of the MultiIndex. + MultiIndex.codes : Get the codes of the MultiIndex. + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) @@ -3899,8 +3905,11 @@ def insert(self, loc: int, item) -> MultiIndex: # have to insert into level # must insert at end otherwise you have to recompute all the # other codes - lev_loc = len(level) - level = level.insert(lev_loc, k) + if isna(k): # GH 59003 + lev_loc = -1 + else: + lev_loc = len(level) + level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) @@ -4095,3 +4104,60 @@ def _require_listlike(level, arr, arrname: str): if not is_list_like(arr) or not is_list_like(arr[0]): raise TypeError(f"{arrname} must be list of lists-like") return level, arr + + +def cartesian_product(X: list[np.ndarray]) -> list[np.ndarray]: + """ + Numpy version of itertools.product. + Sometimes faster (for large inputs)... + + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + + Examples + -------- + >>> cartesian_product([list("ABC"), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])] + """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: self.obj._mgr.column_setitem( loc, plane_indexer, value, inplace_only=True ) - except (ValueError, TypeError, LossySetitemError): + except (ValueError, TypeError, LossySetitemError) as exc: # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object dtype = self.obj.dtypes.iloc[loc] if dtype not in (np.void, object) and not self.obj.empty: # - Exclude np.void, as that is a special case for expansion. - # We want to warn for + # We want to raise for # df = pd.DataFrame({'a': [1, 2]}) # df.loc[:, 'a'] = .3 # but not for
" - f"Value '{value}' has dtype incompatible with {dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise TypeError( + f"Invalid value '{value}' for dtype '{dtype}'" + ) from exc self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then @@ -2440,7 +2434,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame: ax = self.obj.axes[i] if is_sequence(ix) or isinstance(ix, slice): if isinstance(ix, np.ndarray): - ix = ix.ravel() + ix = ix.reshape(-1) if idx is None: idx = ax[ix] elif cols is None: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cffb1f658a640..149bef6258bfa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, final, @@ -101,7 +100,6 @@ ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, IntervalArray, @@ -121,6 +119,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Sequence, @@ -429,7 +428,7 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype(self, other, raise_on_upcast: bool) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -456,7 +455,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) ) ): - warn_on_upcast = False + raise_on_upcast = False elif ( isinstance(other, np.ndarray) and other.ndim == 1 @@ -464,17 +463,10 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_float_dtype(other.dtype) and lib.has_only_ints_or_nan(other) ): - warn_on_upcast = False - - if warn_on_upcast: - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise an error in a future version of pandas. " - f"Value '{other}' has dtype incompatible with {self.values.dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise_on_upcast = False + + if raise_on_upcast: + raise TypeError(f"Invalid value '{other}' for dtype '{self.values.dtype}'") if self.values.dtype == new_dtype: raise AssertionError( f"Did not expect new dtype {new_dtype} to equal self.dtype " @@ -696,14 +688,6 @@ def replace( # go through replace_list values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) - return [blk] - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. 
@@ -729,7 +713,7 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, raise_on_upcast=False) return blk.replace( to_replace=to_replace, value=value, @@ -803,14 +787,6 @@ def replace_list( """ values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -1122,7 +1098,7 @@ def setitem(self, indexer, value) -> Block: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: # current dtype cannot store value, coerce to common dtype - nb = self.coerce_to_target_dtype(value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(value, raise_on_upcast=True) return nb.setitem(indexer, value) else: if self.dtype == _dtype_obj: @@ -1193,7 +1169,7 @@ def putmask(self, mask, new) -> list[Block]: if not is_list_like(new): # using just new[indexer] can't save us the need to cast return self.coerce_to_target_dtype( - new, warn_on_upcast=True + new, raise_on_upcast=True ).putmask(mask, new) else: indexer = mask.nonzero()[0] @@ -1261,7 +1237,7 @@ def where(self, other, cond) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: # no need to split columns - block = self.coerce_to_target_dtype(other) + block = self.coerce_to_target_dtype(other, raise_on_upcast=False) return block.where(orig_other, cond) else: @@ -1455,7 +1431,7 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: fill_value, ) except LossySetitemError: - nb = self.coerce_to_target_dtype(fill_value) + nb = self.coerce_to_target_dtype(fill_value, raise_on_upcast=False) return nb.shift(periods, fill_value=fill_value) else: @@ -1654,11 +1630,11 @@ def setitem(self, indexer, value): except (ValueError, TypeError): if isinstance(self.dtype, IntervalDtype): # see TestSetitemFloatIntervalWithIntIntervalValues - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) elif isinstance(self, NDArrayBackedExtensionBlock): - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) else: @@ -1693,13 +1669,13 @@ def where(self, other, cond) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # TestSetitemFloatIntervalWithIntIntervalValues - blk = self.coerce_to_target_dtype(orig_other) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_other) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) return blk.where(orig_other, orig_cond) else: @@ -1754,13 +1730,13 @@ def putmask(self, mask, new) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Discussion about what we want to support in the general # case GH#39584 - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = 
self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) else: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 23572975a1112..0d149f47fd08c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -842,7 +842,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 + data = [d if type(d) is dict else dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) return content, columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 64109f5c1655c..b47d5fe18b9c9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -8,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NoReturn, cast, @@ -249,7 +249,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: - axes = [Index([])] + self.axes[1:] + axes = [default_index(0)] + self.axes[1:] # preserve dtype if possible if self.ndim == 1: diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index ef20d4c509732..17d4d38c97f33 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -12,7 +12,6 @@ ) from typing import ( TYPE_CHECKING, - Callable, cast, ) @@ -42,6 +41,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 283acaca2c117..02e7445f1d275 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -29,6 +29,8 @@ ) from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.indexes.api import default_index + if TYPE_CHECKING: from pandas._typing import ( DtypeObj, @@ -38,6 +40,7 @@ from pandas import ( DataFrame, + Index, Series, ) else: @@ -199,8 +202,6 @@ def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> No self.columns = columns def compute(self, method: str) -> DataFrame: - from pandas.core.api import Index - n = self.n frame = self.obj columns = self.columns @@ -227,7 +228,7 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: original_index = frame.index cur_frame = frame = frame.reset_index(drop=True) cur_n = n - indexer = Index([], dtype=np.int64) + indexer: Index = default_index(0) for i, column in enumerate(columns): # For each column we apply method to cur_frame[column]. 
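For context on the `make_empty` and `SelectNFrame.compute` changes just above: `default_index(0)` yields an empty `RangeIndex`, matching the default axes of empty pandas objects, whereas `Index([])` produces an object-dtype index. A small sketch (note `default_index` is an internal helper, not public API):

    import pandas as pd
    from pandas.core.indexes.api import default_index  # internal helper

    print(default_index(0))      # RangeIndex(start=0, stop=0, step=1)
    print(pd.Index([]))          # Index([], dtype='object')
    print(pd.DataFrame().index)  # RangeIndex(start=0, stop=0, step=1)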
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 22092551ec882..e775156a6ae2f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -3,8 +3,8 @@ import functools import itertools from typing import ( + TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -48,6 +48,9 @@ notna, ) +if TYPE_CHECKING: + from collections.abc import Callable + bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index d19ac6246e1cd..5cbe1c421e05a 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -5,10 +5,7 @@ from __future__ import annotations from functools import wraps -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING from pandas._libs.lib import item_from_zerodim from pandas._libs.missing import is_matching_na @@ -19,6 +16,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import F diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index c300db8c114c1..395db1617cb63 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -8,13 +8,14 @@ from typing import ( TYPE_CHECKING, Any, - Callable, NoReturn, ) import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, Scalar, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ccbe25fdae841..8ee71ea2293e6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -4,7 +4,6 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, cast, final, @@ -92,7 +91,10 @@ ) if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( Any, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7055201b5a1ee..6836ba3f65691 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -5,9 +5,9 @@ from __future__ import annotations from collections import abc +import types from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, @@ -17,10 +17,12 @@ import numpy as np from pandas._libs import lib -from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import is_bool +from pandas.core.dtypes.common import ( + is_bool, + is_scalar, +) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -46,6 +48,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -385,291 +388,330 @@ def concat( DeprecationWarning, stacklevel=find_stack_level(), ) + if join == "outer": + intersect = False + elif join == "inner": + intersect = True + else: # pragma: no cover + raise ValueError( + "Only can inner (intersect) or outer (union) join the other axis" + ) - op = _Concatenator( - objs, - axis=axis, - ignore_index=ignore_index, - join=join, - keys=keys, - levels=levels, - names=names, - verify_integrity=verify_integrity, - sort=sort, - ) - - return op.get_result() + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
+ ) + sort = bool(sort) + objs, keys, ndims = _clean_keys_and_objs(objs, keys) -class _Concatenator: - """ - Orchestrates a concatenation operation for BlockManagers - """ + # select an object to be our result reference + sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) - sort: bool - - def __init__( - self, - objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], - axis: Axis = 0, - join: str = "outer", - keys: Iterable[Hashable] | None = None, - levels=None, - names: list[HashableT] | None = None, - ignore_index: bool = False, - verify_integrity: bool = False, - sort: bool = False, - ) -> None: - if isinstance(objs, (ABCSeries, ABCDataFrame, str)): - raise TypeError( - "first argument must be an iterable of pandas " - f'objects, you passed an object of type "{type(objs).__name__}"' - ) + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame - if join == "outer": - self.intersect = False - elif join == "inner": - self.intersect = True - else: # pragma: no cover - raise ValueError( - "Only can inner (intersect) or outer (union) join the other axis" - ) + bm_axis = DataFrame._get_axis_number(axis) + is_frame = False + is_series = True + else: + bm_axis = sample._get_axis_number(axis) + is_frame = True + is_series = False - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." - ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + # Need to flip BlockManager axis in the DataFrame special case + bm_axis = sample._get_block_manager_axis(bm_axis) - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis) - objs, keys, ndims = _clean_keys_and_objs(objs, keys) + axis = 1 - bm_axis if is_frame else 0 + names = names or getattr(keys, "names", None) + return _get_result( + objs, + is_series, + bm_axis, + ignore_index, + intersect, + sort, + keys, + levels, + verify_integrity, + names, + axis, + ) - # select an object to be our result reference - sample, objs = _get_sample_object( - objs, ndims, keys, names, levels, self.intersect - ) - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame +def _sanitize_mixed_ndim( + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, +) -> list[Series | DataFrame]: + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + + new_objs = [] + + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - axis = sample._get_block_manager_axis(axis) - - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - if len(ndims) > 1: - objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) - - self.objs = objs - - # note: this is the 
BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels - - def _sanitize_mixed_ndim( - self, - objs: list[Series | DataFrame], - sample: Series | DataFrame, - ignore_index: bool, - axis: AxisInt, - ) -> list[Series | DataFrame]: - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - - new_objs = [] - - current_column = 0 - max_ndim = sample.ndim - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass - - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed dimensional NDFrame objects" - ) - - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - if axis == 1: - # doing a row-wise concatenation so need everything - # to line up - name = 0 - else: - # doing a column-wise concatenation so need series - # to have unique names - name = current_column - current_column += 1 - obj = sample._constructor(obj, copy=False) - if isinstance(obj, ABCDataFrame): - obj.columns = range(name, name + 1, 1) + name = getattr(obj, "name", None) + if ignore_index or name is None: + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 else: - obj = sample._constructor({name: obj}, copy=False) - - new_objs.append(obj) - - return new_objs + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 + obj = sample._constructor(obj, copy=False) + if isinstance(obj, ABCDataFrame): + obj.columns = range(name, name + 1, 1) + else: + obj = sample._constructor({name: obj}, copy=False) - def get_result(self): - cons: Callable[..., DataFrame | Series] - sample: DataFrame | Series + new_objs.append(obj) - # series only - if self._is_series: - sample = cast("Series", self.objs[0]) + return new_objs - # stack blocks - if self.bm_axis == 0: - name = com.consensus_name_attr(self.objs) - cons = sample._constructor - arrs = [ser._values for ser in self.objs] +def _get_result( + objs: list[Series | DataFrame], + is_series: bool, + bm_axis: AxisInt, + ignore_index: bool, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, + axis: AxisInt, +): + cons: Callable[..., DataFrame | Series] + sample: DataFrame | Series - res = concat_compat(arrs, axis=0) + # series only + if is_series: + sample = cast("Series", objs[0]) - new_index: Index - if self.ignore_index: - # We can avoid surprisingly-expensive _get_concat_axis - new_index = default_index(len(res)) - else: - new_index = self.new_axes[0] + # stack blocks + if bm_axis == 0: + name = com.consensus_name_attr(objs) + cons = sample._constructor - mgr = type(sample._mgr).from_array(res, index=new_index) + arrs = [ser._values for ser in objs] - result = sample._constructor_from_mgr(mgr, axes=mgr.axes) - result._name = name - return result.__finalize__(self, method="concat") + res = concat_compat(arrs, axis=0) - # combine as columns in a frame + if ignore_index: + new_index: Index = default_index(len(res)) else: - data = dict(enumerate(self.objs)) + new_index = _get_concat_axis_series( + objs, + ignore_index, + bm_axis, + keys, + levels, + verify_integrity, + names, + ) - # GH28330 Preserves subclassed objects through concat - cons = sample._constructor_expanddim + mgr = type(sample._mgr).from_array(res, index=new_index) - index, 
columns = self.new_axes - df = cons(data, index=index, copy=False) - df.columns = columns - return df.__finalize__(self, method="concat") + result = sample._constructor_from_mgr(mgr, axes=mgr.axes) + result._name = name + return result.__finalize__( + types.SimpleNamespace(objs=objs), method="concat" + ) - # combine block managers + # combine as columns in a frame else: - sample = cast("DataFrame", self.objs[0]) - - mgrs_indexers = [] - for obj in self.objs: - indexers = {} - for ax, new_labels in enumerate(self.new_axes): - # ::-1 to convert BlockManager ax to DataFrame ax - if ax == self.bm_axis: - # Suppress reindexing on concat axis - continue - - # 1-ax to convert BlockManager axis to DataFrame axis - obj_labels = obj.axes[1 - ax] - if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.get_indexer(new_labels) - - mgrs_indexers.append((obj._mgr, indexers)) - - new_data = concatenate_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=False - ) + data = dict(enumerate(objs)) - out = sample._constructor_from_mgr(new_data, axes=new_data.axes) - return out.__finalize__(self, method="concat") + # GH28330 Preserves subclassed objects through concat + cons = sample._constructor_expanddim - @cache_readonly - def new_axes(self) -> list[Index]: - if self._is_series and self.bm_axis == 1: - ndim = 2 - else: - ndim = self.objs[0].ndim - return [ - self._get_concat_axis - if i == self.bm_axis - else get_objs_combined_axis( - self.objs, - axis=self.objs[0]._get_block_manager_axis(i), - intersect=self.intersect, - sort=self.sort, + index = get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(0), + intersect=intersect, + sort=sort, ) - for i in range(ndim) - ] - - @cache_readonly - def _get_concat_axis(self) -> Index: - """ - Return index to be used along concatenation axis. 
- """ - if self._is_series: - if self.bm_axis == 0: - indexes = [x.index for x in self.objs] - elif self.ignore_index: - idx = default_index(len(self.objs)) - return idx - elif self.keys is None: - names: list[Hashable] = [None] * len(self.objs) - num = 0 - has_names = False - for i, x in enumerate(self.objs): - if x.ndim != 1: - raise TypeError( - f"Cannot concatenate type 'Series' with " - f"object of type '{type(x).__name__}'" - ) - if x.name is not None: - names[i] = x.name - has_names = True - else: - names[i] = num - num += 1 - if has_names: - return Index(names) - else: - return default_index(len(self.objs)) - else: - return ensure_index(self.keys).set_names(self.names) - else: - indexes = [x.axes[self.axis] for x in self.objs] + columns = _get_concat_axis_series( + objs, ignore_index, bm_axis, keys, levels, verify_integrity, names + ) + df = cons(data, index=index, copy=False) + df.columns = columns + return df.__finalize__(types.SimpleNamespace(objs=objs), method="concat") + + # combine block managers + else: + sample = cast("DataFrame", objs[0]) + + mgrs_indexers = [] + result_axes = new_axes( + objs, + bm_axis, + intersect, + sort, + keys, + names, + axis, + levels, + verify_integrity, + ignore_index, + ) + for obj in objs: + indexers = {} + for ax, new_labels in enumerate(result_axes): + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == bm_axis: + # Suppress reindexing on concat axis + continue + + # 1-ax to convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.get_indexer(new_labels) + + mgrs_indexers.append((obj._mgr, indexers)) + + new_data = concatenate_managers( + mgrs_indexers, result_axes, concat_axis=bm_axis, copy=False + ) + + out = sample._constructor_from_mgr(new_data, axes=new_data.axes) + return out.__finalize__(types.SimpleNamespace(objs=objs), method="concat") - if self.ignore_index: - idx = default_index(sum(len(i) for i in indexes)) - return idx - if self.keys is None: - if self.levels is not None: +def new_axes( + objs: list[Series | DataFrame], + bm_axis: AxisInt, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + axis: AxisInt, + levels, + verify_integrity: bool, + ignore_index: bool, +) -> list[Index]: + """Return the new [index, column] result for concat.""" + return [ + _get_concat_axis_dataframe( + objs, + axis, + ignore_index, + keys, + names, + levels, + verify_integrity, + ) + if i == bm_axis + else get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(i), + intersect=intersect, + sort=sort, + ) + for i in range(2) + ] + + +def _get_concat_axis_series( + objs: list[Series | DataFrame], + ignore_index: bool, + bm_axis: AxisInt, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, +) -> Index: + """Return result concat axis when concatenating Series objects.""" + if ignore_index: + return default_index(len(objs)) + elif bm_axis == 0: + indexes = [x.index for x in objs] + if keys is None: + if levels is not None: raise ValueError("levels supported only when keys is not None") concat_axis = _concat_indexes(indexes) else: - concat_axis = _make_concat_multiindex( - indexes, self.keys, self.levels, self.names - ) + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping 
values: {overlap}") + return concat_axis + elif keys is None: + result_names: list[Hashable] = [None] * len(objs) + num = 0 + has_names = False + for i, x in enumerate(objs): + if x.ndim != 1: + raise TypeError( + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" + ) + if x.name is not None: + result_names[i] = x.name + has_names = True + else: + result_names[i] = num + num += 1 + if has_names: + return Index(result_names) + else: + return default_index(len(objs)) + else: + return ensure_index(keys).set_names(names) # type: ignore[arg-type] - if self.verify_integrity: - if not concat_axis.is_unique: - overlap = concat_axis[concat_axis.duplicated()].unique() - raise ValueError(f"Indexes have overlapping values: {overlap}") - return concat_axis +def _get_concat_axis_dataframe( + objs: list[Series | DataFrame], + axis: AxisInt, + ignore_index: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + levels, + verify_integrity: bool, +) -> Index: + """Return result concat axis when concatenating DataFrame objects.""" + indexes_gen = (x.axes[axis] for x in objs) + + if ignore_index: + return default_index(sum(len(i) for i in indexes_gen)) + else: + indexes = list(indexes_gen) + + if keys is None: + if levels is not None: + raise ValueError("levels supported only when keys is not None") + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") + + return concat_axis def _clean_keys_and_objs( @@ -680,7 +722,7 @@ def _clean_keys_and_objs( Returns ------- clean_objs : list[Series | DataFrame] - LIst of DataFrame and Series with Nones removed. + List of DataFrame and Series with Nones removed. keys : Index | None None if keys was None Index if objs was a Mapping or keys was not None. Filtered where objs was None. 
@@ -690,28 +732,33 @@ def _clean_keys_and_objs( if isinstance(objs, abc.Mapping): if keys is None: keys = objs.keys() - objs_list = [objs[k] for k in keys] - else: - objs_list = list(objs) + objs = [objs[k] for k in keys] + elif isinstance(objs, (ABCSeries, ABCDataFrame)) or is_scalar(objs): + raise TypeError( + "first argument must be an iterable of pandas " + f'objects, you passed an object of type "{type(objs).__name__}"' + ) + elif not isinstance(objs, abc.Sized): + objs = list(objs) - if len(objs_list) == 0: + if len(objs) == 0: raise ValueError("No objects to concatenate") if keys is not None: if not isinstance(keys, Index): keys = Index(keys) - if len(keys) != len(objs_list): + if len(keys) != len(objs): # GH#43485 raise ValueError( f"The length of the keys ({len(keys)}) must match " - f"the length of the objects to concatenate ({len(objs_list)})" + f"the length of the objects to concatenate ({len(objs)})" ) # GH#1649 key_indices = [] clean_objs = [] ndims = set() - for i, obj in enumerate(objs_list): + for i, obj in enumerate(objs): if obj is None: continue elif isinstance(obj, (ABCSeries, ABCDataFrame)): diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 294de2cf2fe1d..bfd8e3ccd2f7c 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -15,7 +15,6 @@ import pandas.core.algorithms as algos from pandas.core.indexes.api import MultiIndex from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import tile_compat from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: @@ -202,9 +201,9 @@ def melt( if value_vars_was_not_none: frame = frame.iloc[:, algos.unique(idx)] else: - frame = frame.copy() + frame = frame.copy(deep=False) else: - frame = frame.copy() + frame = frame.copy(deep=False) if col_level is not None: # allow list or other? # frame is a copy @@ -266,7 +265,8 @@ def melt( result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = tile_compat(frame.index, num_cols_adjusted) + taker = np.tile(np.arange(len(frame)), num_cols_adjusted) + result.index = frame.index.take(taker) return result diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ddf6bd3c70988..2ce77ac19b9c5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -39,11 +39,7 @@ npt, ) from pandas.errors import MergeError -from pandas.util._decorators import ( - Appender, - Substitution, - cache_readonly, -) +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype @@ -95,7 +91,6 @@ ensure_wrapped_if_datetimelike, extract_array, ) -from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index from pandas.core.sorting import ( get_group_index, @@ -133,8 +128,6 @@ _known = (np.ndarray, ExtensionArray, Index, ABCSeries) -@Substitution("\nleft : DataFrame or named Series") -@Appender(_merge_doc, indents=0) def merge( left: DataFrame | Series, right: DataFrame | Series, @@ -150,6 +143,210 @@ def merge( indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. 
Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + .. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + + Parameters + ---------- + left : DataFrame or named Series + First pandas object to merge. + right : DataFrame or named Series + Second pandas object to merge. + how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. + left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. + right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. + left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. + right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. + sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + copy : bool, default False + If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 + indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. 
The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + + validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Returns + ------- + DataFrame + A DataFrame of the two merged objects. + + See Also + -------- + merge_ordered : Merge with optional filling/interpolation. + merge_asof : Merge on nearest keys. + DataFrame.join : Similar method using indices. + + Examples + -------- + >>> df1 = pd.DataFrame( + ... {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} + ... ) + >>> df2 = pd.DataFrame( + ... {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]} + ... ) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey") + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=("_left", "_right")) + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=(False, False)) + Traceback (most recent call last): + ... + ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = pd.DataFrame({"a": ["foo", "bar"], "b": [1, 2]}) + >>> df2 = pd.DataFrame({"a": ["foo", "baz"], "c": [3, 4]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how="left", on="a") + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + + >>> df1 = pd.DataFrame({"left": ["foo", "bar"]}) + >>> df2 = pd.DataFrame({"right": [7, 8]}) + >>> df1 + left + 0 foo + 1 bar + >>> df2 + right + 0 7 + 1 8 + + >>> df1.merge(df2, how="cross") + left right + 0 foo 7 + 1 foo 8 + 2 bar 7 + 3 bar 8 + """ left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) @@ -476,7 +673,9 @@ def merge_asof( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, @@ -515,6 +714,7 @@ def merge_asof( Returns ------- DataFrame + A DataFrame of the two merged objects. 
See Also -------- diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8c2c2053b0554..0886aad310034 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -3,7 +3,6 @@ import itertools from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -32,11 +31,13 @@ get_objs_combined_axis, ) from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( AggFuncType, @@ -90,7 +91,7 @@ def pivot_table( hierarchical columns whose top level are the function names (inferred from the function objects themselves). If a dict is passed, the key is column to aggregate and the value is - function or list of functions. If ``margin=True``, aggfunc will be + function or list of functions. If ``margins=True``, aggfunc will be used to calculate the partial aggregates. fill_value : scalar, default None Value to replace missing values with (in the resulting pivot table, @@ -356,15 +357,11 @@ def __internal_pivot_table( if not dropna: if isinstance(table.index, MultiIndex): - m = MultiIndex.from_arrays( - cartesian_product(table.index.levels), names=table.index.names - ) + m = MultiIndex.from_product(table.index.levels, names=table.index.names) table = table.reindex(m, axis=0, fill_value=fill_value) if isinstance(table.columns, MultiIndex): - m = MultiIndex.from_arrays( - cartesian_product(table.columns.levels), names=table.columns.names - ) + m = MultiIndex.from_product(table.columns.levels, names=table.columns.names) table = table.reindex(m, axis=1, fill_value=fill_value) if sort is True and isinstance(table, ABCDataFrame): @@ -555,8 +552,6 @@ def _all_key(key): piece = piece.T all_key = _all_key(key) - # we are going to mutate this, so need to copy! - piece = piece.copy() piece[all_key] = margin[key] table_pieces.append(piece) @@ -840,11 +835,11 @@ def pivot( # If columns is None we will create a MultiIndex level with None as name # which might cause duplicated names because None is the default for # level names - data = data.copy(deep=False) - data.index = data.index.copy() - data.index.names = [ - name if name is not None else lib.no_default for name in data.index.names - ] + if any(name is None for name in data.index.names): + data = data.copy(deep=False) + data.index.names = [ + name if name is not None else lib.no_default for name in data.index.names + ] indexed: DataFrame | Series if values is lib.no_default: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5426c72a356d6..9b7b768fe7adb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -42,7 +42,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, + default_index, ) from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -288,21 +288,19 @@ def get_new_values(self, values, fill_value=None): dtype = values.dtype - # if our mask is all True, then we can use our existing dtype - if mask_all: - dtype = values.dtype - new_values = np.empty(result_shape, dtype=dtype) - else: - if isinstance(dtype, ExtensionDtype): - # GH#41875 - # We are assuming that fill_value can be held by this dtype, - # unlike the non-EA case that promotes. 
- cls = dtype.construct_array_type() - new_values = cls._empty(result_shape, dtype=dtype) + if isinstance(dtype, ExtensionDtype): + # GH#41875 + # We are assuming that fill_value can be held by this dtype, + # unlike the non-EA case that promotes. + cls = dtype.construct_array_type() + new_values = cls._empty(result_shape, dtype=dtype) + if not mask_all: new_values[:] = fill_value - else: + else: + if not mask_all: dtype, fill_value = maybe_promote(dtype, fill_value) - new_values = np.empty(result_shape, dtype=dtype) + new_values = np.empty(result_shape, dtype=dtype) + if not mask_all: new_values.fill(fill_value) name = dtype.name @@ -461,7 +459,7 @@ def _unstack_multiple( ) if isinstance(data, Series): - dummy = data.copy() + dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) @@ -842,7 +840,7 @@ def _convert_level_number(level_num: int, columns: Index): [x._values.astype(dtype, copy=False) for _, x in subset.items()] ) N, K = subset.shape - idx = np.arange(N * K).reshape(K, N).T.ravel() + idx = np.arange(N * K).reshape(K, N).T.reshape(-1) value_slice = value_slice.take(idx) else: value_slice = subset.values @@ -924,7 +922,7 @@ def _reorder_for_extension_array_stack( # idx is an indexer like # [c0r0, c1r0, c2r0, ..., # c0r1, c1r1, c2r1, ...] - idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() + idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.reshape(-1) return arr.take(idx) @@ -1025,7 +1023,7 @@ def stack_reshape( buf = [] for idx in stack_cols.unique(): if len(frame.columns) == 1: - data = frame.copy() + data = frame.copy(deep=False) else: if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple): # GH#57750 - if the frame is an Index with tuples, .loc below will fail @@ -1047,7 +1045,7 @@ def stack_reshape( if data.ndim == 1: data.name = 0 else: - data.columns = RangeIndex(len(data.columns)) + data.columns = default_index(len(data.columns)) buf.append(data) if len(buf) > 0 and not frame.empty: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d780433386395..0052bcfe09147 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -44,6 +43,8 @@ from pandas.core.arrays.datetimelike import dtype_to_unit if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( DtypeObj, IntervalLeftRight, diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py deleted file mode 100644 index 0f1fbc662e1a6..0000000000000 --- a/pandas/core/reshape/util.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy as np - -from pandas.core.dtypes.common import is_list_like - -if TYPE_CHECKING: - from pandas._typing import NumpyIndexT - - -def cartesian_product(X) -> list[np.ndarray]: - """ - Numpy version of itertools.product. - Sometimes faster (for large inputs)... - - Parameters - ---------- - X : list-like of list-likes - - Returns - ------- - product : list of ndarrays - - Examples - -------- - >>> cartesian_product([list("ABC"), [1, 2]]) - [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' NumpyIndexT: - """ - Index compat for np.tile. - - Notes - ----- - Does not support multi-dimensional `num`. 
- """ - if isinstance(arr, np.ndarray): - return np.tile(arr, num) - - # Otherwise we have an Index - taker = np.tile(np.arange(len(arr)), num) - return arr.take(taker) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3d1bd8ebb03cb..184c774d04a47 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -17,7 +18,6 @@ IO, TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -49,7 +49,6 @@ deprecate_nonkeyword_arguments, doc, ) -from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -1426,7 +1425,7 @@ def to_string( ) -> None: ... @deprecate_nonkeyword_arguments( - version="3.0.0", allowed_args=["self", "buf"], name="to_string" + version="4.0", allowed_args=["self", "buf"], name="to_string" ) def to_string( self, @@ -1584,7 +1583,7 @@ def to_markdown( ), ) @deprecate_nonkeyword_arguments( - version="3.0.0", allowed_args=["self", "buf"], name="to_markdown" + version="4.0", allowed_args=["self", "buf"], name="to_markdown" ) def to_markdown( self, @@ -3722,25 +3721,7 @@ def argsort( # GH#54257 We allow -1 here so that np.argsort(series) works self._get_axis_number(axis) - values = self._values - mask = isna(values) - - if mask.any(): - # TODO(3.0): once this deprecation is enforced we can call - # self.array.argsort directly, which will close GH#43840 and - # GH#12694 - warnings.warn( - "The behavior of Series.argsort in the presence of NA values is " - "deprecated. In a future version, NA values will be ordered " - "last instead of set to -1.", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = np.full(len(self), -1, dtype=np.intp) - notmask = ~mask - result[notmask] = np.argsort(values[notmask], kind=kind) - else: - result = np.argsort(values, kind=kind) + result = self.array.argsort(kind=kind) res = self._constructor( result, index=self.index, name=self.name, dtype=np.intp, copy=False @@ -5020,7 +5001,8 @@ def pop(self, item: Hashable) -> Any: Returns ------- - Value that is popped from series. + scalar + Value that is popped from series. Examples -------- @@ -6050,8 +6032,69 @@ def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.lt, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("ge", "series")) def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Greater than or equal to of series and other, \ + element-wise (binary operator `ge`). + + Equivalent to ``series >= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.gt : Greater than comparison, element-wise. 
+ Series.le : Less than or equal to comparison, element-wise. + Series.lt : Less than comparison, element-wise. + Series.eq : Equal to comparison, element-wise. + Series.ne : Not equal to comparison, element-wise. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=["a", "b", "c", "d", "e"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=["a", "b", "c", "d", "f"]) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.ge(b, fill_value=0) + a True + b True + c False + d False + e True + f False + dtype: bool + """ return self._flex_method( other, operator.ge, level=level, fill_value=fill_value, axis=axis ) @@ -6469,7 +6512,7 @@ def any( # type: ignore[override] filter_type="bool", ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @Appender(make_doc("all", ndim=1)) def all( self, @@ -6489,7 +6532,7 @@ def all( filter_type="bool", ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") def min( self, axis: Axis | None = 0, @@ -6524,7 +6567,7 @@ def min( Returns ------- scalar or Series (if level specified) - The maximum of the values in the Series. + The minimum of the values in the Series. See Also -------- @@ -6560,7 +6603,7 @@ def min( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") def max( self, axis: Axis | None = 0, @@ -6631,7 +6674,7 @@ def max( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = None, @@ -6673,7 +6716,7 @@ def sum( Returns ------- scalar or Series (if level specified) - Median of the values for the requested axis. + Sum of the values for the requested axis. See Also -------- @@ -6732,7 +6775,7 @@ def sum( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") @doc(make_doc("prod", ndim=1)) def prod( self, @@ -6751,7 +6794,7 @@ def prod( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") def mean( self, axis: Axis | None = 0, @@ -6783,7 +6826,7 @@ def mean( Returns ------- scalar or Series (if level specified) - Median of the values for the requested axis. + Mean of the values for the requested axis. 
See Also -------- @@ -6805,7 +6848,7 @@ def mean( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") def median( self, axis: Axis | None = 0, @@ -6886,7 +6929,7 @@ def median( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") @doc(make_doc("sem", ndim=1)) def sem( self, @@ -6905,7 +6948,7 @@ def sem( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") def var( self, axis: Axis | None = None, @@ -6992,7 +7035,7 @@ def var( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") @doc(make_doc("std", ndim=1)) def std( self, @@ -7011,7 +7054,7 @@ def std( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") @doc(make_doc("skew", ndim=1)) def skew( self, @@ -7024,7 +7067,7 @@ def skew( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") def kurt( self, axis: Axis | None = 0, diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 4fba243f73536..0d8f42694ccb4 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -5,7 +5,6 @@ import itertools from typing import ( TYPE_CHECKING, - Callable, cast, ) @@ -32,6 +31,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7494a43caf004..dd9276179cf4d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -50,6 +49,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index c1f94abff428a..1281a03e297f9 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -3,14 +3,16 @@ import abc from typing import ( TYPE_CHECKING, - Callable, Literal, ) import numpy as np if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) import re from pandas._typing import ( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index bdcf55e61d2d1..290a28ab60ae1 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -5,7 +5,6 @@ import textwrap from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -22,7 +21,10 @@ from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( NpDtype, @@ -457,16 +459,7 @@ def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: 
x.rstrip(to_strip)) def _str_removeprefix(self, prefix: str): - # outstanding question on whether to use native methods for users on Python 3.9+ - # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770, - # in which case we could do return self._str_map(str.removeprefix) - - def removeprefix(text: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - return self._str_map(removeprefix) + return self._str_map(lambda x: x.removeprefix(prefix)) def _str_removesuffix(self, suffix: str): return self._str_map(lambda x: x.removesuffix(suffix)) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c116ef015ae16..0e91bfa99e887 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -6,7 +6,6 @@ from itertools import islice from typing import ( TYPE_CHECKING, - Callable, TypedDict, Union, cast, @@ -77,7 +76,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices @@ -127,7 +129,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False): def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 + if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object guessed_format = guess_datetime_format( first_non_nan_element, dayfirst=dayfirst diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 3d28a73df99d1..3d406d3bfb115 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -64,6 +64,7 @@ def to_numeric( ---------- arg : scalar, list, tuple, 1-d array, or Series Argument to be converted. + errors : {'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. @@ -88,14 +89,15 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"numpy_nullable"``: returns with nullable-dtype-backed + * ``"pyarrow"``: returns with pyarrow-backed nullable :class:`ArrowDtype` .. 
versionadded:: 2.0 diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 296168fe7e725..8d82a5c213910 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -170,7 +170,7 @@ def to_timedelta( TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', '0 days 00:00:04'], dtype='timedelta64[ns]', freq=None) - >>> pd.to_timedelta(np.arange(5), unit="d") + >>> pd.to_timedelta(np.arange(5), unit="D") TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index a6079785e7475..de024f612516b 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -2,17 +2,18 @@ from __future__ import annotations +import inspect import types -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import numpy as np from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError +if TYPE_CHECKING: + from collections.abc import Callable + GLOBAL_USE_NUMBA: bool = False @@ -54,10 +55,15 @@ def get_jit_arguments( engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) - if kwargs and nopython: + if kwargs: + # Note: in case numba supports keyword-only arguments in + # a future version, we should remove this check. But this + # seems unlikely to happen soon. + raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" + "numba does not support keyword-only arguments: " + "https://github.com/numba/numba/issues/2916, " + "https://github.com/numba/numba/issues/6846" ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) @@ -97,3 +103,47 @@ def jit_user_function(func: Callable) -> Callable: numba_func = numba.extending.register_jitable(func) return numba_func + + +_sentinel = object() + + +def prepare_function_arguments( + func: Callable, args: tuple, kwargs: dict ) -> tuple[tuple, dict]: + """ + Prepare arguments for jitted function. As numba functions do not support kwargs, + we try to move kwargs into args if possible. + + Parameters + ---------- + func : function + user defined function + args : tuple + user input positional arguments + kwargs : dict + user input keyword arguments + + Returns + ------- + tuple[tuple, dict] + args, kwargs + + """ + if not kwargs: + return args, kwargs + + # the udf should have this pattern: def udf(value, *args, **kwargs):... + signature = inspect.signature(func) + arguments = signature.bind(_sentinel, *args, **kwargs) + arguments.apply_defaults() + # Ref: https://peps.python.org/pep-0362/ + # Arguments which could be passed as part of either *args or **kwargs + # will be included only in the BoundArguments.args attribute. + args = arguments.args + kwargs = arguments.kwargs + + assert args[0] is _sentinel + args = args[1:] + + return args, kwargs diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index b2855ff1f4048..43a3c03b6cef9 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -134,8 +134,10 @@ class ExponentialMovingWindow(BaseWindow): Provide exponentially weighted (EW) calculations. Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be - provided if ``times`` is not provided.
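The new `prepare_function_arguments` above relies on `inspect.Signature.bind` plus a sentinel for the data argument to move positional-or-keyword arguments out of `kwargs`, since numba-jitted functions cannot take keyword arguments. A standalone sketch of the binding mechanics (`udf` here is a hypothetical user function, not part of the change):

    import inspect

    _sentinel = object()

    def udf(value, a, b=2, *, c=3):
        return value + a + b + c

    bound = inspect.signature(udf).bind(_sentinel, 1, b=5)
    bound.apply_defaults()
    print(bound.args)    # (<object object at 0x...>, 1, 5) -- b was moved into args
    print(bound.kwargs)  # {'c': 3} -- keyword-only arguments stay in kwargs

Keyword-only parameters (``c`` above) survive in ``kwargs`` and would therefore still trigger the `NumbaUtilError` raised in `get_jit_arguments`.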
If ``times`` is provided and ``adjust=True``, ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. + If ``times`` is provided and ``adjust=False``, ``halflife`` must be the only + provided decay-specification parameter. Parameters ---------- @@ -358,8 +360,6 @@ def __init__( self.ignore_na = ignore_na self.times = times if self.times is not None: - if not self.adjust: - raise NotImplementedError("times is not supported with adjust=False.") times_dtype = getattr(self.times, "dtype", None) if not ( is_datetime64_dtype(times_dtype) @@ -376,6 +376,11 @@ def __init__( # Halflife is no longer applicable when calculating COM # But allow COM to still be calculated if the user passes other decay args if common.count_not_none(self.com, self.span, self.alpha) > 0: + if not self.adjust: + raise NotImplementedError( + "None of com, span, or alpha can be specified if " + "times is provided and adjust=False" + ) self._com = get_center_of_mass(self.com, self.span, None, self.alpha) else: self._com = 1.0 diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index f14954cd9a4b0..d0c8a2e67b6ca 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -32,6 +31,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( QuantileInterpolation, WindowingRankType, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 824cf936b8185..171d3bc1d1c35 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -14,6 +13,8 @@ from pandas.core.util.numba_ import jit_user_function if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar @@ -148,6 +149,9 @@ def ewm( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt *= old_wt_factor ** deltas[start + j - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt else: weighted = old_wt_factor * weighted if is_observation: @@ -323,6 +327,9 @@ def ewm_table( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt[j] *= old_wt_factor ** deltas[i - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt[j] else: weighted[j] = old_wt_factor * weighted[j] if is_observations[j]: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 2243d8dd1a613..16aa6d7e56a1c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -90,6 +89,7 @@ ) if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..cb0f89945e440 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,11 +1,14 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") diff --git a/pandas/io/common.py b/pandas/io/common.py index 
4507a7d08c8ba..a76f0cf6dd34d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -55,10 +55,6 @@ BaseBuffer, ReadCsvBuffer, ) -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -784,9 +780,11 @@ def get_handle( # BZ Compression elif compression == "bz2": + import bz2 + # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - handle = get_bz2_file()( # type: ignore[call-overload] + handle = bz2.BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, @@ -849,7 +847,9 @@ def get_handle( # error: Argument 1 to "LZMAFile" has incompatible type "Union[str, # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], # PathLike[bytes]], IO[bytes]], None]" - handle = get_lzma_file()( + import lzma + + handle = lzma.LZMAFile( handle, # type: ignore[arg-type] ioargs.mode, **compression_args, diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1eb22d4ee9de7..f83f9cb1c8d74 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -14,7 +15,6 @@ IO, TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, @@ -957,7 +957,7 @@ class ExcelWriter(Generic[_WorkbookT]): * `xlsxwriter `__ for xlsx files if xlsxwriter is installed otherwise `openpyxl `__ - * `odswriter `__ for ods files + * `odf `__ for ods files See :meth:`DataFrame.to_excel` for typical usage. @@ -1004,7 +1004,7 @@ class ExcelWriter(Generic[_WorkbookT]): * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)`` * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)`` * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)`` - * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` + * odf: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` .. 
versionadded:: 1.3.0 diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index f879f16aa5dc8..e7c5d518abaee 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, MutableMapping, @@ -9,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, overload, diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index d3d0da6f562a7..0af04526ea96d 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -5,10 +5,7 @@ from __future__ import annotations import re -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings from pandas.errors import CSSWarning @@ -16,6 +13,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Iterator, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b6c6112b05ab3..52b5755558900 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -16,7 +17,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -52,6 +52,7 @@ if TYPE_CHECKING: from pandas._typing import ( + ExcelWriterMergeCells, FilePath, IndexLabel, StorageOptions, @@ -523,8 +524,11 @@ class ExcelFormatter: Column label for index column(s) if desired. If None is given, and `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - merge_cells : bool, default False - Format MultiIndex and Hierarchical Rows as merged cells. + merge_cells : bool or 'columns', default False + Format MultiIndex column headers and Hierarchical Rows as merged cells + if True. Merge MultiIndex column headers only if 'columns'. + .. versionchanged:: 3.0.0 + Added the 'columns' option. inf_rep : str, default `'inf'` representation for np.inf values (which aren't representable in Excel) A `'-'` sign will be added in front of -inf. @@ -547,7 +551,7 @@ def __init__( header: Sequence[Hashable] | bool = True, index: bool = True, index_label: IndexLabel | None = None, - merge_cells: bool = False, + merge_cells: ExcelWriterMergeCells = False, inf_rep: str = "inf", style_converter: Callable | None = None, ) -> None: @@ -580,6 +584,9 @@ def __init__( self.index = index self.index_label = index_label self.header = header + + if not isinstance(merge_cells, bool) and merge_cells != "columns": + raise ValueError(f"Unexpected value for {merge_cells=}.") self.merge_cells = merge_cells self.inf_rep = inf_rep @@ -614,7 +621,7 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: columns = self.columns level_strs = columns._format_multi( - sparsify=self.merge_cells, include_names=False + sparsify=self.merge_cells in {True, "columns"}, include_names=False ) level_lengths = get_level_lengths(level_strs) coloffset = 0 @@ -623,7 +630,7 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: if self.index and isinstance(self.df.index, MultiIndex): coloffset = self.df.index.nlevels - 1 - if self.merge_cells: + if self.merge_cells in {True, "columns"}: # Format multi-index as a merged cells. 
for lnum, name in enumerate(columns.names): yield ExcelCell( @@ -793,7 +800,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # with index names (blank if None) for # unambiguous round-trip, unless not merging, # in which case the names all go on one row Issue #11328 - if isinstance(self.columns, MultiIndex) and self.merge_cells: + if isinstance(self.columns, MultiIndex) and ( + self.merge_cells in {True, "columns"} + ): self.rowcounter += 1 # if index labels are not empty go ahead and dump @@ -801,7 +810,7 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: for cidx, name in enumerate(index_labels): yield ExcelCell(self.rowcounter - 1, cidx, name, None) - if self.merge_cells: + if self.merge_cells and self.merge_cells != "columns": # Format hierarchical rows as merged cells. level_strs = self.df.index._format_multi( sparsify=True, include_names=False diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c503121328f53..9ad5ac83e9eae 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -6,6 +6,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Generator, Hashable, Mapping, @@ -22,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, cast, ) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 0bd4f2935f4d0..67b5eb6f5ee5b 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Iterable, Mapping, Sequence, @@ -13,7 +14,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, TypeVar, Union, ) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 8212b50594842..6f4c2fa6c6eae 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Callable, overload, ) @@ -55,6 +54,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -66,6 +66,7 @@ Axis, AxisInt, Concatenate, + ExcelWriterMergeCells, FilePath, IndexLabel, IntervalClosedType, @@ -551,7 +552,7 @@ def to_excel( startrow: int = 0, startcol: int = 0, engine: str | None = None, - merge_cells: bool = True, + merge_cells: ExcelWriterMergeCells = True, encoding: str | None = None, inf_rep: str = "inf", verbose: bool = True, diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 92afbc0e150ef..ec718f2a1276f 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1,13 +1,15 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Sequence +from collections.abc import ( + Callable, + Sequence, +) from functools import partial import re from typing import ( TYPE_CHECKING, Any, - Callable, DefaultDict, Optional, TypedDict, @@ -1050,7 +1052,7 @@ def format( When using a ``formatter`` string the dtypes must be compatible, otherwise a `ValueError` will be raised. - When instantiating a Styler, default formatting can be applied be setting the + When instantiating a Styler, default formatting can be applied by setting the ``pandas.options``: - ``styler.format.formatter``: default None. diff --git a/pandas/io/html.py b/pandas/io/html.py index 42f5266e7649b..4b8bc48130fab 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1178,7 +1178,10 @@ def read_html( **after** `skiprows` is applied. 
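To illustrate the list-return guarantee noted just below, a minimal sketch with a hypothetical HTML string (read_html needs an HTML parser such as lxml or bs4 installed):

from io import StringIO

import pandas as pd

html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"

# read_html always returns a list of DataFrames, never a bare DataFrame;
# index into it to get the first parsed table
tables = pd.read_html(StringIO(html))
df = tables[0]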
This function will *always* return a list of :class:`DataFrame` *or* - it will fail, e.g., it will *not* return an empty list. + it will fail, i.e., it will *not* return an empty list, save for some + rare cases. + It might return an empty list in case of inputs with a single row and + ``<td>`` containing only whitespace. Examples -------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 13d74e935f786..b29ead1d14b1d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, @@ -60,11 +59,13 @@ from pandas.io.json._table_schema import ( build_table_schema, parse_table_schema, + set_default_names, ) from pandas.io.parsers.readers import validate_integer if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Mapping, ) @@ -353,6 +354,8 @@ def __init__( raise ValueError(msg) self.schema = build_table_schema(obj, index=self.index) + if self.index: + obj = set_default_names(obj) # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): @@ -369,18 +372,22 @@ def __init__( msg = "Overlapping names between the index and columns" raise ValueError(msg) - obj = obj.copy() timedeltas = obj.select_dtypes(include=["timedelta"]).columns + copied = False if len(timedeltas): + obj = obj.copy() + copied = True obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat()) - # Convert PeriodIndex to datetimes before serializing - if isinstance(obj.index.dtype, PeriodDtype): - obj.index = obj.index.to_timestamp() # exclude index from obj if index=False if not self.index: self.obj = obj.reset_index(drop=True) else: + # Convert PeriodIndex to datetimes before serializing + if isinstance(obj.index.dtype, PeriodDtype): + if not copied: + obj = obj.copy(deep=False) + obj.index = obj.index.to_timestamp() self.obj = obj.reset_index(drop=False) self.date_format = "iso" self.orient = "records" @@ -965,7 +972,7 @@ def read(self) -> DataFrame | Series: else: return obj - def _get_object_parser(self, json) -> DataFrame | Series: + def _get_object_parser(self, json: str) -> DataFrame | Series: """ Parses a json document into a pandas object. 
""" @@ -981,16 +988,14 @@ def _get_object_parser(self, json) -> DataFrame | Series: "date_unit": self.date_unit, "dtype_backend": self.dtype_backend, } - obj = None if typ == "frame": - obj = FrameParser(json, **kwargs).parse() - - if typ == "series" or obj is None: + return FrameParser(json, **kwargs).parse() + elif typ == "series": if not isinstance(dtype, bool): kwargs["dtype"] = dtype - obj = SeriesParser(json, **kwargs).parse() - - return obj + return SeriesParser(json, **kwargs).parse() + else: + raise ValueError(f"{typ=} must be 'frame' or 'series'.") def close(self) -> None: """ @@ -1103,7 +1108,6 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj: DataFrame | Series | None = None self.dtype_backend = dtype_backend @final @@ -1117,26 +1121,22 @@ def check_keys_split(self, decoded: dict) -> None: raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}") @final - def parse(self): - self._parse() + def parse(self) -> DataFrame | Series: + obj = self._parse() - if self.obj is None: - return None if self.convert_axes: - self._convert_axes() - self._try_convert_types() - return self.obj + obj = self._convert_axes(obj) + obj = self._try_convert_types(obj) + return obj - def _parse(self) -> None: + def _parse(self) -> DataFrame | Series: raise AbstractMethodError(self) @final - def _convert_axes(self) -> None: + def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series: """ Try to convert axes. """ - obj = self.obj - assert obj is not None # for mypy for axis_name in obj._AXIS_ORDERS: ax = obj._get_axis(axis_name) ser = Series(ax, dtype=ax.dtype, copy=False) @@ -1149,9 +1149,10 @@ def _convert_axes(self) -> None: ) if result: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) - setattr(self.obj, axis_name, new_axis) + setattr(obj, axis_name, new_axis) + return obj - def _try_convert_types(self) -> None: + def _try_convert_types(self, obj): raise AbstractMethodError(self) @final @@ -1178,8 +1179,10 @@ def _try_convert_data( elif self.dtype is True: pass - else: - # dtype to force + elif not _should_convert_dates( + convert_dates, self.keep_default_dates, name + ): + # convert_dates takes precedence over columns listed in dtypes dtype = ( self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype ) @@ -1190,8 +1193,8 @@ def _try_convert_data( return data, False if convert_dates: - new_data, result = self._try_convert_to_date(data) - if result: + new_data = self._try_convert_to_date(data) + if new_data is not data: return new_data, True converted = False @@ -1241,16 +1244,16 @@ def _try_convert_data( return data, converted @final - def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: + def _try_convert_to_date(self, data: Series) -> Series: """ Try to parse a ndarray like into a date column. Try to coerce object in epoch/iso formats and integer/float in epoch - formats. Return a boolean if parsing was successful. + formats. 
""" # no conversion on empty if not len(data): - return data, False + return data new_data = data @@ -1261,7 +1264,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: try: new_data = data.astype("int64") except OverflowError: - return data, False + return data except (TypeError, ValueError): pass @@ -1273,57 +1276,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: | (new_data._values == iNaT) ) if not in_range.all(): - return data, False + return data date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS for date_unit in date_units: try: - new_data = to_datetime(new_data, errors="raise", unit=date_unit) + return to_datetime(new_data, errors="raise", unit=date_unit) except (ValueError, OverflowError, TypeError): continue - return new_data, True - return data, False + return data class SeriesParser(Parser): _default_orient = "index" _split_keys = ("name", "index", "data") - obj: Series | None - def _parse(self) -> None: + def _parse(self) -> Series: data = ujson_loads(self.json, precise_float=self.precise_float) if self.orient == "split": decoded = {str(k): v for k, v in data.items()} self.check_keys_split(decoded) - self.obj = Series(**decoded) + return Series(**decoded) else: - self.obj = Series(data) + return Series(data) - def _try_convert_types(self) -> None: - if self.obj is None: - return - obj, result = self._try_convert_data( - "data", self.obj, convert_dates=self.convert_dates - ) - if result: - self.obj = obj + def _try_convert_types(self, obj: Series) -> Series: + obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates) + return obj class FrameParser(Parser): _default_orient = "columns" _split_keys = ("columns", "index", "data") - obj: DataFrame | None - def _parse(self) -> None: + def _parse(self) -> DataFrame: json = self.json orient = self.orient - if orient == "columns": - self.obj = DataFrame( - ujson_loads(json, precise_float=self.precise_float), dtype=None - ) - elif orient == "split": + if orient == "split": decoded = { str(k): v for k, v in ujson_loads(json, precise_float=self.precise_float).items() @@ -1337,90 +1328,61 @@ def _parse(self) -> None: orig_names, is_potential_multi_index(orig_names, None), ) - self.obj = DataFrame(dtype=None, **decoded) + return DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = DataFrame.from_dict( + return DataFrame.from_dict( ujson_loads(json, precise_float=self.precise_float), dtype=None, orient="index", ) elif orient == "table": - self.obj = parse_table_schema(json, precise_float=self.precise_float) + return parse_table_schema(json, precise_float=self.precise_float) else: - self.obj = DataFrame( + # includes orient == "columns" + return DataFrame( ujson_loads(json, precise_float=self.precise_float), dtype=None ) - def _process_converter( - self, - f: Callable[[Hashable, Series], tuple[Series, bool]], - filt: Callable[[Hashable], bool] | None = None, - ) -> None: - """ - Take a conversion function and possibly recreate the frame. 
- """ - if filt is None: - filt = lambda col: True - - obj = self.obj - assert obj is not None # for mypy - - needs_new_obj = False - new_obj = {} - for i, (col, c) in enumerate(obj.items()): - if filt(col): - new_data, result = f(col, c) - if result: - c = new_data - needs_new_obj = True - new_obj[i] = c - - if needs_new_obj: - # possibly handle dup columns - new_frame = DataFrame(new_obj, index=obj.index) - new_frame.columns = obj.columns - self.obj = new_frame - - def _try_convert_types(self) -> None: - if self.obj is None: - return - if self.convert_dates: - self._try_convert_dates() - - self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False) + def _try_convert_types(self, obj: DataFrame) -> DataFrame: + arrays = [] + for col_label, series in obj.items(): + result, _ = self._try_convert_data( + col_label, + series, + convert_dates=_should_convert_dates( + self.convert_dates, + keep_default_dates=self.keep_default_dates, + col=col_label, + ), + ) + arrays.append(result.array) + return DataFrame._from_arrays( + arrays, obj.columns, obj.index, verify_integrity=False ) - def _try_convert_dates(self) -> None: - if self.obj is None: - return - - # our columns to parse - convert_dates_list_bool = self.convert_dates - if isinstance(convert_dates_list_bool, bool): - convert_dates_list_bool = [] - convert_dates = set(convert_dates_list_bool) - - def is_ok(col) -> bool: - """ - Return if this col is ok to try for a date parse. - """ - if col in convert_dates: - return True - if not self.keep_default_dates: - return False - if not isinstance(col, str): - return False - - col_lower = col.lower() - if ( - col_lower.endswith(("_at", "_time")) - or col_lower == "modified" - or col_lower == "date" - or col_lower == "datetime" - or col_lower.startswith("timestamp") - ): - return True - return False - self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok) +def _should_convert_dates( + convert_dates: bool | list[str], + keep_default_dates: bool, + col: Hashable, +) -> bool: + """ + Return bool whether a DataFrame column should be cast to datetime. + """ + if convert_dates is False: + # convert_dates=True means follow keep_default_dates + return False + elif not isinstance(convert_dates, bool) and col in set(convert_dates): + return True + elif not keep_default_dates: + return False + elif not isinstance(col, str): + return False + col_lower = col.lower() + if ( + col_lower.endswith(("_at", "_time")) + or col_lower in {"modified", "date", "datetime"} + or col_lower.startswith("timestamp") + ): + return True + return False diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index d4b412404c308..d966e38fa11a5 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -114,7 +114,7 @@ def set_default_names(data): ) return data - data = data.copy() + data = data.copy(deep=False) if data.index.nlevels > 1: data.index.names = com.fill_missing_names(data.index.names) else: @@ -275,7 +275,7 @@ def build_table_schema( >>> df = pd.DataFrame( ... {'A': [1, 2, 3], ... 'B': ['a', 'b', 'c'], - ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), + ... 'C': pd.date_range('2016-01-01', freq='D', periods=3), ... 
}, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) {'fields': \ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c6cc85b9f722b..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, final, overload, @@ -29,27 +28,19 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, ) from pandas.core.dtypes.missing import isna from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, StringDtype, @@ -59,12 +50,9 @@ ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -78,6 +66,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Mapping, Sequence, @@ -86,7 +75,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, Hashable, HashableT, Scalar, @@ -127,7 +115,6 @@ def __init__(self, kwds) -> None: "for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates - self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) @@ -145,12 +132,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -181,58 +162,12 @@ def __init__(self, kwds) -> None: self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe. 
- - """ - if not isinstance(self.parse_dates, list): - return set() - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in self.parse_dates - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return { - col if (isinstance(col, str) or col in columns) else columns[col] - for col in self.parse_dates - } - def close(self) -> None: pass @@ -404,9 +339,12 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: for i, arr in enumerate(index): if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -420,7 +358,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) else: @@ -451,90 +389,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: return index - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - converters=None, - dtypes=None, - ) -> dict[Any, np.ndarray]: - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." 
- ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - return result - @final def _set_noconvert_dtype_columns( self, col_indices: list[int], names: Sequence[Hashable] @@ -580,6 +434,7 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: noconvert_columns.add(_set(val)) @@ -705,78 +560,6 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? 
some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, @@ -797,16 +580,25 @@ def _do_date_conversions( names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, ) -> Mapping[Hashable, ArrayLike] | DataFrame: - if isinstance(self.parse_dates, list): - return _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - dtype_backend=self.dtype_backend, + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] return data @@ -901,56 +693,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." - ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. 
- raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1040,40 +782,37 @@ def _get_empty_meta( return index, columns, col_dict -def _make_date_converter( +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(date_col, col: Hashable): - if date_col.dtype.kind in "Mm": - return date_col - - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = lib.ensure_string_array(date_col) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - # test_multi_index_parse_dates - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -1116,43 +855,7 @@ def converter(date_col, col: Hashable): } -def _process_date_conversion( - data_dict: Mapping[Hashable, ArrayLike] | DataFrame, - converter: Callable, - parse_spec: list, - index_col, - index_names, - columns: Sequence[Hashable] | Index, - dtype_backend=lib.no_default, -) -> Mapping[Hashable, ArrayLike] | DataFrame: - for colspec in parse_spec: - if isinstance(colspec, int) and colspec not in data_dict: - colspec = columns[colspec] - if (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - result = converter(np.asarray(data_dict[colspec]), col=colspec) - # error: Unsupported target for indexed assignment - # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") - data_dict[colspec] = result # type: ignore[index] - - return data_dict - - -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. 
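A minimal sketch of the reworked ``date_converter`` helper above, with assumed inputs; it now takes the parsing options directly instead of being built as a closure, and here it is assumed to be imported from ``pandas.io.parsers.base_parser``:

import numpy as np

raw = np.array(["2020-01-01", "2020-01-02"], dtype=object)

# Returns a datetime64[ns] ndarray when to_datetime succeeds; on
# ValueError/TypeError the values come back as strings unchanged
converted = date_converter(
    raw,
    col="when",
    dayfirst=False,
    cache_dates=True,
    date_format=None,
)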
@@ -1189,3 +892,99 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): def is_index_col(col) -> bool: return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If a column listed in parse_dates is not in the dataframe. + + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype' is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. 
+ raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..b59a778624c49 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,7 +30,9 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -160,7 +162,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' @@ -344,9 +346,12 @@ def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( + values = date_converter( values, col=self.index_names[index] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) return values diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f7d2aa2419429..05fe963e9b2b7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -10,9 +10,11 @@ from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -27,20 +29,39 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -53,13 +74,13 @@ from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, T, ) from pandas import ( - Index, MultiIndex, Series, ) @@ -157,7 +178,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -370,6 +390,165 @@ def _convert_data( clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters 
is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." + ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + values_str, + dtype=cast_type, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: 
ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 66edbcaa755ed..c28d3aaaf4748 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -17,7 +17,6 @@ IO, TYPE_CHECKING, Any, - Callable, Generic, Literal, TypedDict, @@ -70,6 +69,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -674,6 +674,14 @@ def _read( # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) + + # Check type of encoding_errors + errors = kwds.get("encoding_errors", "strict") + if not isinstance(errors, str): + raise ValueError( + f"encoding_errors must be a string, got {type(errors).__name__}" + ) + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( @@ -1534,7 +1542,10 @@ def get_chunk(self, size: int | None = None) -> DataFrame: if self.nrows is not None: if self._currow >= self.nrows: raise StopIteration - size = min(size, self.nrows - self._currow) + if size is None: + size = self.nrows - self._currow + else: + size = min(size, self.nrows - self._currow) return self.read(nrows=size) def __enter__(self) -> Self: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4fce338ccad6f..1420ce84b4db8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -18,7 +18,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, Literal, cast, @@ -102,6 +101,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, @@ -2655,7 +2655,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz) + converted = _set_tz(converted, tz, dtype) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -3036,7 +3036,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz) + ret = _set_tz(ret, tz, dtype) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -4964,7 +4964,9 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone -def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeArray: +def _set_tz( + values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str +) -> DatetimeArray: 
""" Coerce the values to a DatetimeArray with appropriate tz. @@ -4972,11 +4974,13 @@ def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeA ---------- values : ndarray[int64] tz : str, tzinfo, or None + datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]" """ assert values.dtype == "i8", values.dtype # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; # expected "tzinfo" - dtype = tz_to_dtype(tz=tz, unit="ns") # type: ignore[arg-type] + unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count + dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] dta = DatetimeArray._from_sequence(values, dtype=dtype) return dta diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 874320f08fb75..41b368c9b05c2 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -23,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -67,6 +66,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterator, Mapping, @@ -1014,7 +1014,7 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: def insert_data(self) -> tuple[list[str], list[np.ndarray]]: if self.index is not None: - temp = self.frame.copy() + temp = self.frame.copy(deep=False) temp.index.names = self.index try: temp.reset_index(inplace=True) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d1e57ad568ba5..5146876d20374 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -25,7 +25,6 @@ IO, TYPE_CHECKING, AnyStr, - Callable, Final, cast, ) @@ -74,6 +73,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a6cd06cd61687..8c7381a926e72 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) from pandas._libs import lib @@ -35,7 +34,10 @@ from pandas.io.parsers import TextParser if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from xml.etree.ElementTree import Element from lxml import etree diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 0daf3cfafe81c..61c44e58b643a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -3,7 +3,6 @@ import importlib from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -27,6 +26,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -982,10 +982,7 @@ def __call__(self, *args, **kwargs): f"Valid plot kinds: {self._all_kinds}" ) - # The original data structured can be transformed before passed to the - # backend. For example, for DataFrame is common to set the index as the - # `x` parameter, and return a Series with the parameter `y` as values. - data = self._parent.copy() + data = self._parent if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True @@ -1005,7 +1002,7 @@ def __call__(self, *args, **kwargs): if is_integer(y) and not holds_integer(data.columns): y = data.columns[y] # converted to series actually. 
copy to not modify - data = data[y].copy() + data = data[y].copy(deep=False) data.index.name = y elif isinstance(data, ABCDataFrame): data_cols = data.columns @@ -1032,8 +1029,7 @@ def __call__(self, *args, **kwargs): except (IndexError, KeyError, TypeError): pass - # don't overwrite - data = data[y].copy() + data = data[y] if isinstance(data, ABCSeries): label_name = label_kw or y diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 11c0ba01fff64..6bb10068bee38 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -311,8 +311,6 @@ def _grouped_plot_by_column( layout=layout, ) - _axes = flatten_axes(axes) - # GH 45465: move the "by" label based on "vert" xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None) if kwargs.get("vert", True): @@ -322,8 +320,7 @@ def _grouped_plot_by_column( ax_values = [] - for i, col in enumerate(columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), columns): gp_col = grouped[col] keys, values = zip(*gp_col) re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs) @@ -531,10 +528,8 @@ def boxplot_frame_groupby( figsize=figsize, layout=layout, ) - axes = flatten_axes(axes) - data = {} - for (key, group), ax in zip(grouped, axes): + for (key, group), ax in zip(grouped, flatten_axes(axes)): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds ) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index a8f08769ceae2..fc63d65f1e160 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -556,7 +556,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -755,7 +756,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -826,7 +828,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -873,7 +876,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2d3c81f2512aa..8b108346160d6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -586,7 +586,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: fig.set_size_inches(self.figsize) axes = self.ax - axes = flatten_axes(axes) + axes = 
np.fromiter(flatten_axes(axes), dtype=object) if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] @@ -893,7 +893,13 @@ def _make_legend(self) -> None: elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc="best") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") @final @staticmethod diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ca635386be335..2c4d714bf1a0c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -95,11 +95,12 @@ def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" nd_values = data.infer_objects()._get_numeric_data() - values = np.ravel(nd_values) + values = nd_values.values + if nd_values.ndim == 2: + values = values.reshape(-1) values = values[~isna(values)] - hist, bins = np.histogram(values, bins=bins, range=self._bin_range) - return bins + return np.histogram_bin_edges(values, bins=bins, range=self._bin_range) # error: Signature of "_plot" incompatible with supertype "LinePlot" @classmethod @@ -322,10 +323,7 @@ def _grouped_plot( naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout ) - _axes = flatten_axes(axes) - - for i, (key, group) in enumerate(grouped): - ax = _axes[i] + for ax, (key, group) in zip(flatten_axes(axes), grouped): if numeric_only and isinstance(group, ABCDataFrame): group = group._get_numeric_data() plotf(group, ax, **kwargs) @@ -557,12 +555,9 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = flatten_axes(axes) - can_set_label = "label" not in kwds - for i, col in enumerate(data.columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), data.columns): if legend and can_set_label: kwds["label"] = col ax.hist(data[col].dropna().values, bins=bins, **kwds) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index ae82f0232aee0..f9c370b2486fd 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -18,7 +18,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import ( + Generator, + Iterable, + ) from matplotlib.axes import Axes from matplotlib.axis import Axis @@ -231,7 +234,7 @@ def create_subplots( else: if is_list_like(ax): if squeeze: - ax = flatten_axes(ax) + ax = np.fromiter(flatten_axes(ax), dtype=object) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored.", @@ -260,7 +263,7 @@ def create_subplots( if squeeze: return fig, ax else: - return fig, flatten_axes(ax) + return fig, np.fromiter(flatten_axes(ax), dtype=object) else: warnings.warn( "To output multiple subplots, the figure containing " @@ -439,12 +442,13 @@ def handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def flatten_axes(axes: Axes | Iterable[Axes]) -> np.ndarray: +def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes, None, None]: if not is_list_like(axes): - return np.array([axes]) + yield axes # type: ignore[misc] elif isinstance(axes, (np.ndarray, ABCIndex)): - return np.asarray(axes).ravel() - return np.array(axes) + yield from np.asarray(axes).reshape(-1) + else: + yield from axes # type: ignore[misc] def set_ticks_props( @@ -456,13 +460,13 @@ def set_ticks_props( ): for ax in 
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py
index ae82f0232aee0..f9c370b2486fd 100644
--- a/pandas/plotting/_matplotlib/tools.py
+++ b/pandas/plotting/_matplotlib/tools.py
@@ -18,7 +18,10 @@
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import (
+        Generator,
+        Iterable,
+    )
 
     from matplotlib.axes import Axes
     from matplotlib.axis import Axis
@@ -231,7 +234,7 @@ def create_subplots(
     else:
         if is_list_like(ax):
             if squeeze:
-                ax = flatten_axes(ax)
+                ax = np.fromiter(flatten_axes(ax), dtype=object)
             if layout is not None:
                 warnings.warn(
                     "When passing multiple axes, layout keyword is ignored.",
@@ -260,7 +263,7 @@ def create_subplots(
             if squeeze:
                 return fig, ax
             else:
-                return fig, flatten_axes(ax)
+                return fig, np.fromiter(flatten_axes(ax), dtype=object)
         else:
             warnings.warn(
                 "To output multiple subplots, the figure containing "
@@ -439,12 +442,13 @@ def handle_shared_axes(
                 _remove_labels_from_axis(ax.yaxis)
 
 
-def flatten_axes(axes: Axes | Iterable[Axes]) -> np.ndarray:
+def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes, None, None]:
     if not is_list_like(axes):
-        return np.array([axes])
+        yield axes  # type: ignore[misc]
     elif isinstance(axes, (np.ndarray, ABCIndex)):
-        return np.asarray(axes).ravel()
-    return np.array(axes)
+        yield from np.asarray(axes).reshape(-1)
+    else:
+        yield from axes  # type: ignore[misc]
 
 
 def set_ticks_props(
@@ -456,13 +460,13 @@ def set_ticks_props(
 ):
     for ax in flatten_axes(axes):
         if xlabelsize is not None:
-            mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize)
+            mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize)  # type: ignore[arg-type]
         if xrot is not None:
-            mpl.artist.setp(ax.get_xticklabels(), rotation=xrot)
+            mpl.artist.setp(ax.get_xticklabels(), rotation=xrot)  # type: ignore[arg-type]
         if ylabelsize is not None:
-            mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize)
+            mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize)  # type: ignore[arg-type]
         if yrot is not None:
-            mpl.artist.setp(ax.get_yticklabels(), rotation=yrot)
+            mpl.artist.setp(ax.get_yticklabels(), rotation=yrot)  # type: ignore[arg-type]
     return axes
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index cbc68265a1cc1..939997f44c1a9 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -63,16 +63,60 @@ def test_apply(float_frame, engine, request):
 
 @pytest.mark.parametrize("axis", [0, 1])
 @pytest.mark.parametrize("raw", [True, False])
-def test_apply_args(float_frame, axis, raw, engine, request):
-    if engine == "numba":
-        mark = pytest.mark.xfail(reason="numba engine doesn't support args")
-        request.node.add_marker(mark)
+@pytest.mark.parametrize("nopython", [True, False])
+def test_apply_args(float_frame, axis, raw, engine, nopython):
+    engine_kwargs = {"nopython": nopython}
     result = float_frame.apply(
-        lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine
+        lambda x, y: x + y,
+        axis,
+        args=(1,),
+        raw=raw,
+        engine=engine,
+        engine_kwargs=engine_kwargs,
     )
     expected = float_frame + 1
     tm.assert_frame_equal(result, expected)
 
+    # GH:58712
+    result = float_frame.apply(
+        lambda x, a, b: x + a + b,
+        args=(1,),
+        b=2,
+        raw=raw,
+        engine=engine,
+        engine_kwargs=engine_kwargs,
+    )
+    expected = float_frame + 3
+    tm.assert_frame_equal(result, expected)
+
+    if engine == "numba":
+        # keyword-only arguments are not supported in numba
+        with pytest.raises(
+            pd.errors.NumbaUtilError,
+            match="numba does not support keyword-only arguments",
+        ):
+            float_frame.apply(
+                lambda x, a, *, b: x + a + b,
+                args=(1,),
+                b=2,
+                raw=raw,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+            )
+
+        with pytest.raises(
+            pd.errors.NumbaUtilError,
+            match="numba does not support keyword-only arguments",
+        ):
+            float_frame.apply(
+                lambda *x, b: x[0] + x[1] + b,
+                args=(1,),
+                b=2,
+                raw=raw,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+            )
+
 
 def test_apply_categorical_func():
     # GH 9573
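The rewritten test above exercises GH 58712: DataFrame.apply forwards both positional args and extra keyword arguments to the numba engine, while keyword-only parameters stay unsupported there. A rough usage sketch (assumes a pandas build with this change and numba installed):

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0]})
# args fills `a`; the extra keyword fills `b`
out = df.apply(
    lambda col, a, b: col + a + b, args=(1,), b=2, raw=True, engine="numba"
)
# lambda col, a, *, b: ... would raise NumbaUtilError instead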
diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
index f9807310460b4..cfc93ecae295d 100644
--- a/pandas/tests/arithmetic/test_datetime64.py
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -5,6 +5,7 @@
     datetime,
     time,
     timedelta,
+    timezone,
 )
 from itertools import (
     product,
@@ -14,7 +15,6 @@
 
 import numpy as np
 import pytest
-import pytz
 
 from pandas._libs.tslibs.conversion import localize_pydatetime
 from pandas._libs.tslibs.offsets import shift_months
@@ -1870,8 +1870,10 @@ def test_dt64tz_series_sub_dtitz(self):
 
     def test_sub_datetime_compat(self, unit):
         # see GH#14088
-        ser = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]).dt.as_unit(unit)
-        dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc)
+        ser = Series([datetime(2016, 8, 23, 12, tzinfo=timezone.utc), NaT]).dt.as_unit(
+            unit
+        )
+        dt = datetime(2016, 8, 22, 12, tzinfo=timezone.utc)
         # The datetime object has "us" so we upcast lower units
         exp_unit = tm.get_finest_unit(unit, "us")
         exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit)
diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py
index 539df9d61a7b2..67762e0b89c73 100644
--- a/pandas/tests/arithmetic/test_period.py
+++ b/pandas/tests/arithmetic/test_period.py
@@ -1086,7 +1086,7 @@ def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array):
         with pytest.raises(TypeError, match=msg):
             other - rng
 
-    @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"])
+    @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5D"])
    def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array):
         # GH#23031 adding a time-delta-like offset to a PeriodArray that has
         # tick-like frequency with n != 1
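Several test modules above drop the pytz import in favor of the standard library: for plain UTC, datetime.timezone.utc is a drop-in replacement that pandas normalizes the same way. For example:

from datetime import datetime, timezone
import pandas as pd

ts = pd.Timestamp(datetime(2016, 8, 23, 12, tzinfo=timezone.utc))
assert str(ts.tz) == "UTC"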
diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py
index 3c677142846d7..7f3e8d3ed6e6e 100644
--- a/pandas/tests/arrays/categorical/test_replace.py
+++ b/pandas/tests/arrays/categorical/test_replace.py
@@ -6,106 +6,66 @@
 
 @pytest.mark.parametrize(
-    "to_replace,value,expected,flip_categories",
+    "to_replace,value,expected",
     [
         # one-to-one
-        (1, 2, [2, 2, 3], False),
-        (1, 4, [4, 2, 3], False),
-        (4, 1, [1, 2, 3], False),
-        (5, 6, [1, 2, 3], False),
+        (4, 1, [1, 2, 3]),
+        (3, 1, [1, 2, 1]),
         # many-to-one
-        ([1], 2, [2, 2, 3], False),
-        ([1, 2], 3, [3, 3, 3], False),
-        ([1, 2], 4, [4, 4, 3], False),
-        ((1, 2, 4), 5, [5, 5, 3], False),
-        ((5, 6), 2, [1, 2, 3], False),
-        ([1], [2], [2, 2, 3], False),
-        ([1, 4], [5, 2], [5, 2, 3], False),
-        # GH49404: overlap between to_replace and value
-        ([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
-        # GH50872, GH46884: replace with null
-        (1, None, [None, 2, 3], False),
-        (1, pd.NA, [None, 2, 3], False),
-        # check_categorical sorts categories, which crashes on mixed dtypes
-        (3, "4", [1, 2, "4"], False),
-        ([1, 2, "3"], "5", ["5", "5", 3], True),
+        ((5, 6), 2, [1, 2, 3]),
+        ((3, 2), 1, [1, 1, 1]),
     ],
 )
-@pytest.mark.filterwarnings(
-    "ignore:.*with CategoricalDtype is deprecated:FutureWarning"
-)
-def test_replace_categorical_series(to_replace, value, expected, flip_categories):
+def test_replace_categorical_series(to_replace, value, expected):
     # GH 31720
-
     ser = pd.Series([1, 2, 3], dtype="category")
     result = ser.replace(to_replace, value)
-    expected = pd.Series(expected, dtype="category")
-    ser.replace(to_replace, value, inplace=True)
-
-    if flip_categories:
-        expected = expected.cat.set_categories(expected.cat.categories[::-1])
-
-    tm.assert_series_equal(expected, result, check_category_order=False)
-    tm.assert_series_equal(expected, ser, check_category_order=False)
+    expected = pd.Series(Categorical(expected, categories=[1, 2, 3]))
+    tm.assert_series_equal(result, expected)
 
 
 @pytest.mark.parametrize(
-    "to_replace, value, result, expected_error_msg",
+    "to_replace,value",
     [
-        ("b", "c", ["a", "c"], "Categorical.categories are different"),
-        ("c", "d", ["a", "b"], None),
-        # https://github.com/pandas-dev/pandas/issues/33288
-        ("a", "a", ["a", "b"], None),
-        ("b", None, ["a", None], "Categorical.categories length are different"),
+        # one-to-one
+        (3, 5),
+        # many-to-one
+        ((3, 2), 5),
     ],
 )
-def test_replace_categorical(to_replace, value, result, expected_error_msg):
-    # GH#26988
-    cat = Categorical(["a", "b"])
-    expected = Categorical(result)
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if expected_error_msg is not None else None
-    with tm.assert_produces_warning(warn, match=msg):
-        result = pd.Series(cat, copy=False).replace(to_replace, value)._values
+def test_replace_categorical_series_new_category_raises(to_replace, value):
+    # GH 31720
+    ser = pd.Series([1, 2, 3], dtype="category")
+    with pytest.raises(
+        TypeError, match="Cannot setitem on a Categorical with a new category"
+    ):
+        ser.replace(to_replace, value)
 
-    tm.assert_categorical_equal(result, expected)
-    if to_replace == "b":  # the "c" test is supposed to be unchanged
-        with pytest.raises(AssertionError, match=expected_error_msg):
-            # ensure non-inplace call does not affect original
-            tm.assert_categorical_equal(cat, expected)
 
-    ser = pd.Series(cat, copy=False)
-    with tm.assert_produces_warning(warn, match=msg):
-        ser.replace(to_replace, value, inplace=True)
-    tm.assert_categorical_equal(cat, expected)
+def test_replace_maintain_ordering():
+    # GH51016
+    dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
+    ser = pd.Series([0, 1, 2], dtype=dtype)
+    result = ser.replace(0, 2)
+    expected = pd.Series([2, 1, 2], dtype=dtype)
+    tm.assert_series_equal(expected, result, check_category_order=True)
 
 
 def test_replace_categorical_ea_dtype():
     # GH49404
-    cat = Categorical(pd.array(["a", "b"], dtype="string"))
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
+    cat = Categorical(pd.array(["a", "b", "c"], dtype="string"))
+    result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values
+    expected = Categorical(
+        pd.array(["c"] * 3, dtype="string"),
+        categories=pd.array(["a", "b", "c"], dtype="string"),
     )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
-    expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
     tm.assert_categorical_equal(result, expected)
 
 
-def test_replace_maintain_ordering():
-    # GH51016
-    dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
-    ser = pd.Series([0, 1, 2], dtype=dtype)
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = ser.replace(0, 2)
-    expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
-    expected = pd.Series([2, 1, 2], dtype=expected_dtype)
-    tm.assert_series_equal(expected, result, check_category_order=True)
+def test_replace_categorical_ea_dtype_different_cats_raises():
+    # GH49404
+    cat = Categorical(pd.array(["a", "b"], dtype="string"))
+    with pytest.raises(
+        TypeError, match="Cannot setitem on a Categorical with a new category"
+    ):
+        pd.Series(cat).replace(["a", "b"], ["c", pd.NA])
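With the deprecation enforced above, Series.replace on categorical data only relabels values to categories that already exist; introducing a new category now raises TypeError, and cat.rename_categories is the explicit alternative. A short sketch of both paths:

import pandas as pd

ser = pd.Series([1, 2, 3], dtype="category")
ser.replace(3, 1)                   # fine: 1 is already a category
# ser.replace(3, 5) now raises TypeError("Cannot setitem on a Categorical
# with a new category ...")
ser.cat.rename_categories({3: 5})   # explicit relabeling instead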
diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py
index 87eb7bcfa9cee..bd3298940ae3a 100644
--- a/pandas/tests/arrays/sparse/test_accessor.py
+++ b/pandas/tests/arrays/sparse/test_accessor.py
@@ -105,28 +105,36 @@ def test_accessor_raises(self):
 
     @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
     @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
-    @pytest.mark.parametrize("dtype", ["float64", "int64"])
+    @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
     def test_from_spmatrix(self, format, labels, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
+        sp_dtype = SparseDtype(dtype)
 
-        mat = sp_sparse.eye(10, format=format, dtype=dtype)
-        result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
+        sp_mat = sp_sparse.eye(10, format=format, dtype=dtype)
+        result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels)
+        mat = np.eye(10, dtype=dtype)
         expected = pd.DataFrame(
-            np.eye(10, dtype=dtype), index=labels, columns=labels
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
+            index=labels,
+            columns=labels,
         ).astype(sp_dtype)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
-    def test_from_spmatrix_including_explicit_zero(self, format):
+    @pytest.mark.parametrize("dtype", [np.int64, bool])
+    def test_from_spmatrix_including_explicit_zero(self, format, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        mat = sp_sparse.random(10, 2, density=0.5, format=format)
-        mat.data[0] = 0
-        result = pd.DataFrame.sparse.from_spmatrix(mat)
-        dtype = SparseDtype("float64", 0.0)
-        expected = pd.DataFrame(mat.todense()).astype(dtype)
+        sp_dtype = SparseDtype(dtype)
+
+        sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype)
+        sp_mat.data[0] = 0
+        result = pd.DataFrame.sparse.from_spmatrix(sp_mat)
+        mat = sp_mat.toarray()
+        expected = pd.DataFrame(
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value)
+        ).astype(sp_dtype)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format):
     def test_from_spmatrix_columns(self, columns):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        dtype = SparseDtype("float64", 0.0)
+        sp_dtype = SparseDtype(np.float64)
 
-        mat = sp_sparse.random(10, 2, density=0.5)
-        result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
-        expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
+        sp_mat = sp_sparse.random(10, 2, density=0.5)
+        result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns)
+        mat = sp_mat.toarray()
+        expected = pd.DataFrame(
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
+            columns=columns,
+        ).astype(sp_dtype)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
+        "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
     )
-    def test_to_coo(self, colnames):
+    @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
+    def test_to_coo(self, columns, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        df = pd.DataFrame(
-            {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
-        )
-        result = df.sparse.to_coo()
-        expected = sp_sparse.coo_matrix(np.asarray(df))
-        assert (result != expected).nnz == 0
+        sp_dtype = SparseDtype(dtype)
 
-    @pytest.mark.parametrize("fill_value", [1, np.nan])
-    def test_to_coo_nonzero_fill_val_raises(self, fill_value):
-        pytest.importorskip("scipy")
-        df = pd.DataFrame(
-            {
-                "A": SparseArray(
-                    [fill_value, fill_value, fill_value, 2], fill_value=fill_value
-                ),
-                "B": SparseArray(
-                    [fill_value, 2, fill_value, fill_value], fill_value=fill_value
-                ),
-            }
-        )
-        with pytest.raises(ValueError, match="fill value must be 0"):
-            df.sparse.to_coo()
+        expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype)
+        mat = expected.toarray()
+        result = pd.DataFrame(
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
+            columns=columns,
+            dtype=sp_dtype,
+        ).sparse.to_coo()
+        assert (result != expected).nnz == 0
 
     def test_to_coo_midx_categorical(self):
         # GH#50996
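The reworked sparse-accessor tests build their expected frames by masking the zeros, so the from_spmatrix round trip can be checked for any dtype rather than float64 only. The pattern, approximately:

import numpy as np
import pandas as pd
from pandas import SparseDtype

mat = np.eye(3, dtype=np.int64)
sp_dtype = SparseDtype(np.int64)          # fill_value defaults to 0 here
# zeros become the fill value; everything else stays dense data
dense = np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value)
expected = pd.DataFrame(dense).astype(sp_dtype)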
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index 97d57163ed079..f7b76e7388ae9 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -1,9 +1,9 @@
 import datetime
 import decimal
+import zoneinfo
 
 import numpy as np
 import pytest
-import pytz
 
 import pandas as pd
 import pandas._testing as tm
@@ -285,9 +285,6 @@ def test_array_copy():
     assert tm.shares_memory(a, b)
 
 
-cet = pytz.timezone("CET")
-
-
 @pytest.mark.parametrize(
     "data, expected",
     [
@@ -326,11 +323,18 @@ def test_array_copy():
         ),
         (
             [
-                datetime.datetime(2000, 1, 1, tzinfo=cet),
-                datetime.datetime(2001, 1, 1, tzinfo=cet),
+                datetime.datetime(
+                    2000, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin")
+                ),
+                datetime.datetime(
+                    2001, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin")
+                ),
             ],
             DatetimeArray._from_sequence(
-                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="us")
+                ["2000", "2001"],
+                dtype=pd.DatetimeTZDtype(
+                    tz=zoneinfo.ZoneInfo("Europe/Berlin"), unit="us"
+                ),
             ),
         ),
         # timedelta
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 63d60c78da482..de189b7e2f724 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -7,12 +7,6 @@
 from datetime import timedelta
 import operator
 
-try:
-    from zoneinfo import ZoneInfo
-except ImportError:
-    # Cannot assign to a type
-    ZoneInfo = None  # type: ignore[misc, assignment]
-
 import numpy as np
 import pytest
 
@@ -724,21 +718,14 @@ def test_tz_localize_t2d(self):
         roundtrip = expected.tz_localize("US/Pacific")
         tm.assert_datetime_array_equal(roundtrip, dta)
 
-    easts = ["US/Eastern", "dateutil/US/Eastern"]
-    if ZoneInfo is not None:
-        try:
-            tz = ZoneInfo("US/Eastern")
-        except KeyError:
-            # no tzdata
-            pass
-        else:
-            # Argument 1 to "append" of "list" has incompatible type "ZoneInfo";
-            # expected "str"
-            easts.append(tz)  # type: ignore[arg-type]
-
-    @pytest.mark.parametrize("tz", easts)
+    @pytest.mark.parametrize(
+        "tz", ["US/Eastern", "dateutil/US/Eastern", "pytz/US/Eastern"]
+    )
     def test_iter_zoneinfo_fold(self, tz):
         # GH#49684
+        if tz.startswith("pytz/"):
+            pytz = pytest.importorskip("pytz")
+            tz = pytz.timezone(tz.removeprefix("pytz/"))
         utc_vals = np.array(
             [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64
         )
@@ -773,7 +760,7 @@ def test_date_range_frequency_M_Q_Y_raises(self, freq):
         with pytest.raises(ValueError, match=msg):
             pd.date_range("1/1/2000", periods=4, freq=freq)
 
-    @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"])
+    @pytest.mark.parametrize("freq_depr", ["2MIN", "2mS", "2Us"])
     def test_date_range_uppercase_frequency_deprecated(self, freq_depr):
         # GH#9586, GH#54939
         depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a "
@@ -820,6 +807,13 @@ def test_date_range_frequency_A_raises(self, freq):
         with pytest.raises(ValueError, match=msg):
             pd.date_range("1/1/2000", periods=4, freq=freq)
 
+    @pytest.mark.parametrize("freq", ["2H", "2CBH", "2S"])
+    def test_date_range_uppercase_frequency_raises(self, freq):
+        msg = f"Invalid frequency: {freq}"
+
+        with pytest.raises(ValueError, match=msg):
+            pd.date_range("1/1/2000", periods=4, freq=freq)
+
 
 def test_factorize_sort_without_freq():
     dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]")
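test_iter_zoneinfo_fold now spells the pytz case as a "pytz/" string prefix, so pytz is imported lazily and the test skips cleanly when it is absent. The decode step, schematically (resolve_tz is a hypothetical helper mirroring the test body):

import pytest

def resolve_tz(tz: str):
    # "pytz/US/Eastern" -> pytz timezone, skipping if pytz is missing;
    # any other string is passed through for pandas to interpret.
    if tz.startswith("pytz/"):
        pytz = pytest.importorskip("pytz")
        return pytz.timezone(tz.removeprefix("pytz/"))
    return tz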
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index d52f33fe80434..1844b47847e95 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -758,16 +758,25 @@ class TestTypeCasting:
     # maybe someday... numexpr has too many upcasting rules now
     # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float']))
     @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")])
-    def test_binop_typecasting(self, engine, parser, op, float_numpy_dtype, left_right):
-        df = DataFrame(
-            np.random.default_rng(2).standard_normal((5, 3)), dtype=float_numpy_dtype
-        )
+    def test_binop_typecasting(
+        self, engine, parser, op, complex_or_float_dtype, left_right, request
+    ):
+        # GH#21374
+        dtype = complex_or_float_dtype
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype)
         left, right = left_right
         s = f"{left} {op} {right}"
         res = pd.eval(s, engine=engine, parser=parser)
-        assert df.values.dtype == float_numpy_dtype
-        assert res.values.dtype == float_numpy_dtype
-        tm.assert_frame_equal(res, eval(s))
+        if dtype == "complex64" and engine == "numexpr":
+            mark = pytest.mark.xfail(
+                reason="numexpr issue with complex that are upcast "
+                "to complex 128 "
+                "https://github.com/pydata/numexpr/issues/492"
+            )
+            request.applymarker(mark)
+        assert df.values.dtype == dtype
+        assert res.values.dtype == dtype
+        tm.assert_frame_equal(res, eval(s), check_exact=False)
 
 
 # -------------------------------------
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
index b10141b0d63f4..37a21e1098e78 100644
--- a/pandas/tests/copy_view/test_indexing.py
+++ b/pandas/tests/copy_view/test_indexing.py
@@ -725,15 +725,13 @@ def test_column_as_series_set_with_upcast(backend):
         with pytest.raises(TypeError, match="Invalid value"):
             s[0] = "foo"
         expected = Series([1, 2, 3], name="a")
+        tm.assert_series_equal(s, expected)
+        tm.assert_frame_equal(df, df_orig)
+        # ensure cached series on getitem is not the changed series
+        tm.assert_series_equal(df["a"], df_orig["a"])
     else:
-        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+        with pytest.raises(TypeError, match="Invalid value"):
             s[0] = "foo"
-        expected = Series(["foo", 2, 3], dtype=object, name="a")
-
-    tm.assert_series_equal(s, expected)
-    tm.assert_frame_equal(df, df_orig)
-    # ensure cached series on getitem is not the changed series
-    tm.assert_series_equal(df["a"], df_orig["a"])
 
 
 @pytest.mark.parametrize(
@@ -805,16 +803,14 @@ def test_set_value_copy_only_necessary_column(indexer_func, indexer, val, col):
     view = df[:]
 
     if val == "a":
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype is deprecated"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             indexer_func(df)[indexer] = val
+    else:
+        indexer_func(df)[indexer] = val
 
-    indexer_func(df)[indexer] = val
-
-    assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
-    assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
-    tm.assert_frame_equal(view, df_orig)
+        assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+        tm.assert_frame_equal(view, df_orig)
 
 
 def test_series_midx_slice():
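These copy-on-write tests encode the enforcement of the old "incompatible dtype" deprecation: assigning a value that does not fit the column dtype no longer warns and upcasts, it raises TypeError and leaves the frame untouched. Under the enforced behavior, for instance:

import pandas as pd
import pytest

df = pd.DataFrame({"a": [1, 2, 3]})   # int64
with pytest.raises(TypeError, match="Invalid value"):
    df.iloc[0, 0] = "foo"             # would previously upcast to object
assert df["a"].dtype == "int64"       # original data is unchanged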
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 3712a74fe54ed..6f0cbe12a2ea0 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -1105,26 +1105,26 @@ def test_putmask_aligns_rhs_no_reference(dtype):
     assert np.shares_memory(arr_a, get_array(df, "a"))
 
 
-@pytest.mark.parametrize(
-    "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)]
-)
-def test_putmask_dont_copy_some_blocks(val, exp, warn):
+@pytest.mark.parametrize("val, exp, raises", [(5.5, True, True), (5, False, False)])
+def test_putmask_dont_copy_some_blocks(val, exp, raises: bool):
     df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5})
     view = df[:]
     df_orig = df.copy()
     indexer = DataFrame(
         [[True, False, False], [True, False, False]], columns=list("abc")
     )
-    with tm.assert_produces_warning(warn, match="incompatible dtype"):
+    if raises:
+        with pytest.raises(TypeError, match="Invalid value"):
+            df[indexer] = val
+    else:
         df[indexer] = val
-
-    assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
-    # TODO(CoW): Could split blocks to avoid copying the whole block
-    assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp
-    assert np.shares_memory(get_array(view, "c"), get_array(df, "c"))
-    assert df._mgr._has_no_reference(1) is not exp
-    assert not df._mgr._has_no_reference(2)
-    tm.assert_frame_equal(view, df_orig)
+        assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
+        # TODO(CoW): Could split blocks to avoid copying the whole block
+        assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp
+        assert np.shares_memory(get_array(view, "c"), get_array(df, "c"))
+        assert df._mgr._has_no_reference(1) is not exp
+        assert not df._mgr._has_no_reference(2)
+        tm.assert_frame_equal(view, df_orig)
 
 
 @pytest.mark.parametrize("dtype", ["int64", "Int64"])
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index 63254f1244a2e..2eb88923c0087 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -129,18 +129,14 @@ def test_replace_to_replace_wrong_dtype():
 def test_replace_list_categorical():
     df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
     arr = get_array(df, "a")
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df.replace(["c"], value="a", inplace=True)
+
+    df.replace(["c"], value="a", inplace=True)
     assert np.shares_memory(arr.codes, get_array(df, "a").codes)
     assert df._mgr._has_no_reference(0)
 
     df_orig = df.copy()
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df2 = df.replace(["b"], value="a")
+    df.replace(["b"], value="a")
+    df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"}))
     assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
     tm.assert_frame_equal(df, df_orig)
 
@@ -150,13 +146,7 @@ def test_replace_list_inplace_refs_categorical():
     df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
     view = df[:]
     df_orig = df.copy()
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df.replace(["c"], value="a", inplace=True)
-    assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes)
+    df.replace(["c"], value="a", inplace=True)
     tm.assert_frame_equal(df_orig, view)
 
 
@@ -195,56 +185,34 @@ def test_replace_inplace_reference_no_op(to_replace):
 
 @pytest.mark.parametrize("to_replace", [1, [1]])
-@pytest.mark.parametrize("val", [1, 1.5])
-def test_replace_categorical_inplace_reference(val, to_replace):
+def test_replace_categorical_inplace_reference(to_replace):
     df = DataFrame({"a": Categorical([1, 2, 3])})
     df_orig = df.copy()
     arr_a = get_array(df, "a")
     view = df[:]
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if val == 1.5 else None
-    with tm.assert_produces_warning(warn, match=msg):
-        df.replace(to_replace=to_replace, value=val, inplace=True)
-
+    df.replace(to_replace=to_replace, value=1, inplace=True)
     assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
     assert df._mgr._has_no_reference(0)
     assert view._mgr._has_no_reference(0)
     tm.assert_frame_equal(view, df_orig)
 
 
-@pytest.mark.parametrize("val", [1, 1.5])
-def test_replace_categorical_inplace(val):
+def test_replace_categorical_inplace():
     df = DataFrame({"a": Categorical([1, 2, 3])})
     arr_a = get_array(df, "a")
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if val == 1.5 else None
-    with tm.assert_produces_warning(warn, match=msg):
-        df.replace(to_replace=1, value=val, inplace=True)
+    df.replace(to_replace=1, value=1, inplace=True)
 
     assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
     assert df._mgr._has_no_reference(0)
 
-    expected = DataFrame({"a": Categorical([val, 2, 3])})
+    expected = DataFrame({"a": Categorical([1, 2, 3])})
     tm.assert_frame_equal(df, expected)
 
 
-@pytest.mark.parametrize("val", [1, 1.5])
-def test_replace_categorical(val):
+def test_replace_categorical():
     df = DataFrame({"a": Categorical([1, 2, 3])})
     df_orig = df.copy()
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if val == 1.5 else None
-    with tm.assert_produces_warning(warn, match=msg):
-        df2 = df.replace(to_replace=1, value=val)
+    df2 = df.replace(to_replace=1, value=1)
 
     assert df._mgr._has_no_reference(0)
     assert df2._mgr._has_no_reference(0)
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index c6da01636247d..903c13587151a 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 import pytest
-import pytz
 
 from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
 
@@ -391,8 +390,9 @@ def test_empty(self):
 
     def test_tz_standardize(self):
         # GH 24713
+        pytz = pytest.importorskip("pytz")
         tz = pytz.timezone("US/Eastern")
-        dr = date_range("2013-01-01", periods=3, tz="US/Eastern")
+        dr = date_range("2013-01-01", periods=3, tz=tz)
         dtype = DatetimeTZDtype("ns", dr.tz)
         assert dtype.tz == tz
         dtype = DatetimeTZDtype("ns", dr[0].tz)
@@ -1231,3 +1231,15 @@ def test_multi_column_dtype_assignment():
 
     df["b"] = 0
     tm.assert_frame_equal(df, expected)
+
+
+def test_loc_setitem_empty_labels_no_dtype_conversion():
+    # GH 29707
+
+    df = pd.DataFrame({"a": [2, 3]})
+    expected = df.copy()
+    assert df.a.dtype == "int64"
+    df.loc[[]] = 0.1
+
+    assert df.a.dtype == "int64"
+    tm.assert_frame_equal(df, expected)
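The new test above pins down GH 29707: setting through .loc with an empty label list is a no-op and must not coerce the column dtype, even when the assigned scalar is a float:

import pandas as pd

df = pd.DataFrame({"a": [2, 3]})
df.loc[[]] = 0.1            # selects zero rows; nothing is written
assert df["a"].dtype == "int64"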
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index db18cd4aef14e..b1d7c701e1267 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -12,6 +12,7 @@
     datetime,
     time,
     timedelta,
+    timezone,
 )
 from decimal import Decimal
 from fractions import Fraction
@@ -27,7 +28,6 @@
 
 import numpy as np
 import pytest
-import pytz
 
 from pandas._libs import (
     lib,
@@ -1022,7 +1022,7 @@ def test_maybe_convert_objects_itemsize(self, data0, data1):
 
     def test_mixed_dtypes_remain_object_array(self):
         # GH14956
-        arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)
+        arr = np.array([datetime(2015, 1, 1, tzinfo=timezone.utc), 1], dtype=object)
         result = lib.maybe_convert_objects(arr, convert_non_numeric=True)
         tm.assert_numpy_array_equal(result, arr)
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index 2109c794ad44f..f86ed6f49759f 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples():
         ("f2", np.nan),
         ("f4", np.nan),
         ("f8", np.nan),
+        # Complex
+        ("c8", np.nan),
+        ("c16", np.nan),
         # Object
         ("O", np.nan),
         # Interval
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index b951d4c35d208..b7f0f973e640a 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -116,10 +116,8 @@ def test_argsort_missing_array(self, data_missing_for_sorting):
         tm.assert_numpy_array_equal(result, expected)
 
     def test_argsort_missing(self, data_missing_for_sorting):
-        msg = "The behavior of Series.argsort in the presence of NA values"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = pd.Series(data_missing_for_sorting).argsort()
-        expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
+        result = pd.Series(data_missing_for_sorting).argsort()
+        expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
         tm.assert_series_equal(result, expected)
 
     def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 5926d23b44dd0..4fad5e45409b9 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2437,13 +2437,13 @@ def test_unsupported_dt(data):
         ["hour", 3],
         ["minute", 4],
         ["is_leap_year", False],
-        ["microsecond", 5],
+        ["microsecond", 2000],
         ["month", 1],
         ["nanosecond", 6],
         ["quarter", 1],
         ["second", 7],
         ["date", date(2023, 1, 2)],
-        ["time", time(3, 4, 7, 5)],
+        ["time", time(3, 4, 7, 2000)],
     ],
 )
 def test_dt_properties(prop, expected):
@@ -2456,7 +2456,7 @@
                 hour=3,
                 minute=4,
                 second=7,
-                microsecond=5,
+                microsecond=2000,
                 nanosecond=6,
             ),
             None,
@@ -2473,6 +2473,28 @@
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize("microsecond", [2000, 5, 0])
+def test_dt_microsecond(microsecond):
+    # GH 59183
+    ser = pd.Series(
+        [
+            pd.Timestamp(
+                year=2024,
+                month=7,
+                day=7,
+                second=5,
+                microsecond=microsecond,
+                nanosecond=6,
+            ),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    result = ser.dt.microsecond
+    expected = pd.Series([microsecond, None], dtype="int64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
 def test_dt_is_month_start_end():
     ser = pd.Series(
         [
@@ -2905,6 +2927,31 @@ def test_dt_components():
     tm.assert_frame_equal(result, expected)
 
 
+def test_dt_components_large_values():
+    ser = pd.Series(
+        [
+            pd.Timedelta("365 days 23:59:59.999000"),
+            None,
+        ],
+        dtype=ArrowDtype(pa.duration("ns")),
+    )
+    result = ser.dt.components
+    expected = pd.DataFrame(
+        [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]],
+        columns=[
+            "days",
+            "hours",
+            "minutes",
+            "seconds",
+            "milliseconds",
+            "microseconds",
+            "nanoseconds",
+        ],
+        dtype="int32[pyarrow]",
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize("skipna", [True, False])
 def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
     # GH51624
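The arrow tests switch microsecond from 5 to 2000 to make GH 59183 visible: ser.dt.microsecond on an ArrowDtype timestamp must report the microsecond component alone, without folding in the nanosecond part. A sketch of the fixed behavior (requires pyarrow):

import pandas as pd
import pyarrow as pa

ser = pd.Series(
    [pd.Timestamp(2024, 7, 7, microsecond=2000, nanosecond=6)],
    dtype=pd.ArrowDtype(pa.timestamp("ns")),
)
assert ser.dt.microsecond[0] == 2000  # not 2000006, and not a truncated value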
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
index 35e143fcedf7b..5be42d41af03a 100644
--- a/pandas/tests/frame/constructors/test_from_records.py
+++ b/pandas/tests/frame/constructors/test_from_records.py
@@ -1,10 +1,12 @@
 from collections.abc import Iterator
-from datetime import datetime
+from datetime import (
+    datetime,
+    timezone,
+)
 from decimal import Decimal
 
 import numpy as np
 import pytest
-import pytz
 
 from pandas._config import using_pyarrow_string_dtype
 
@@ -239,7 +241,7 @@ def test_from_records_series_categorical_index(self):
         tm.assert_frame_equal(frame, expected)
 
     def test_frame_from_records_utc(self):
-        rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)}
+        rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=timezone.utc)}
 
         # it works
         DataFrame.from_records([rec], index="begin_time")
diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py
index f55605d1ffa12..472bfb7772a80 100644
--- a/pandas/tests/frame/indexing/test_coercion.py
+++ b/pandas/tests/frame/indexing/test_coercion.py
@@ -49,35 +49,19 @@ def test_loc_setitem_multiindex_columns(self, consolidate):
 
 def test_37477():
     # fixed by GH#45121
     orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
-    expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]})
 
     df = orig.copy()
-    with tm.assert_produces_warning(
-        FutureWarning, match="Setting an item of incompatible dtype"
-    ):
+    with pytest.raises(TypeError, match="Invalid value"):
         df.at[1, "B"] = 1.2
-    tm.assert_frame_equal(df, expected)
 
     df = orig.copy()
-    with tm.assert_produces_warning(
-        FutureWarning, match="Setting an item of incompatible dtype"
-    ):
+    with pytest.raises(TypeError, match="Invalid value"):
         df.loc[1, "B"] = 1.2
-    tm.assert_frame_equal(df, expected)
 
     df = orig.copy()
-    with tm.assert_produces_warning(
-        FutureWarning, match="Setting an item of incompatible dtype"
-    ):
+    with pytest.raises(TypeError, match="Invalid value"):
         df.iat[1, 1] = 1.2
-    tm.assert_frame_equal(df, expected)
 
     df = orig.copy()
-    with tm.assert_produces_warning(
-        FutureWarning, match="Setting an item of incompatible dtype"
-    ):
+    with pytest.raises(TypeError, match="Invalid value"):
         df.iloc[1, 1] = 1.2
-    tm.assert_frame_equal(df, expected)
 
 
 def test_6942(indexer_al):
@@ -107,19 +91,11 @@ def test_26395(indexer_al):
     expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
     tm.assert_frame_equal(df, expected)
 
-    with tm.assert_produces_warning(
-        FutureWarning, match="Setting an item of incompatible dtype"
-    ):
+    with pytest.raises(TypeError, match="Invalid value"):
         indexer_al(df)["C", "D"] = 44.5
-    expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64)
-    tm.assert_frame_equal(df, expected)
 
-    with tm.assert_produces_warning(
-        FutureWarning, match="Setting an item of incompatible dtype"
-    ):
+    with pytest.raises(TypeError, match="Invalid value"):
         indexer_al(df)["C", "D"] = "hello"
-    expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object)
-    tm.assert_frame_equal(df, expected)
 
 
 @pytest.mark.xfail(reason="unwanted upcast")
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 9cd2c2515f49a..693075a881833 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -25,7 +25,6 @@
     Timestamp,
     date_range,
     isna,
-    notna,
     to_datetime,
 )
 import pandas._testing as tm
@@ -833,13 +832,8 @@ def test_setitem_single_column_mixed_datetime(self):
         tm.assert_series_equal(result, expected)
        # GH#16674 iNaT is treated as an integer when given by the user
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.loc["b", "timestamp"] = iNaT
-        assert not isna(df.loc["b", "timestamp"])
-        assert df["timestamp"].dtype == np.object_
-        assert df.loc["b", "timestamp"] == iNaT
 
         # allow this syntax (as of GH#3216)
         df.loc["c", "timestamp"] = np.nan
@@ -851,35 +845,11 @@ def test_setitem_single_column_mixed_datetime(self):
 
     def test_setitem_mixed_datetime(self):
         # GH 9336
-        expected = DataFrame(
-            {
-                "a": [0, 0, 0, 0, 13, 14],
-                "b": [
-                    datetime(2012, 1, 1),
-                    1,
-                    "x",
-                    "y",
-                    datetime(2013, 1, 1),
-                    datetime(2014, 1, 1),
-                ],
-            }
-        )
         df = DataFrame(0, columns=list("ab"), index=range(6))
         df["b"] = pd.NaT
         df.loc[0, "b"] = datetime(2012, 1, 1)
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.loc[1, "b"] = 1
-        df.loc[[2, 3], "b"] = "x", "y"
-        A = np.array(
-            [
-                [13, np.datetime64("2013-01-01T00:00:00")],
-                [14, np.datetime64("2014-01-01T00:00:00")],
-            ]
-        )
-        df.loc[[4, 5], ["a", "b"]] = A
-        tm.assert_frame_equal(df, expected)
 
     def test_setitem_frame_float(self, float_frame):
         piece = float_frame.loc[float_frame.index[:2], ["A", "B"]]
@@ -936,8 +906,12 @@ def test_setitem_frame_upcast(self):
         # needs upcasting
         df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"])
         df2 = df.copy()
-        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+        with pytest.raises(TypeError, match="Invalid value"):
             df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5
+        # Manually upcast so we can add .5
+        df = df.astype({"A": "float64", "B": "float64"})
+        df2 = df2.astype({"A": "float64", "B": "float64"})
+        df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5
         expected = df.reindex(columns=["A", "B"])
         expected += 0.5
         expected["C"] = df["C"]
@@ -1366,12 +1340,8 @@ def test_loc_setitem_rhs_frame(self, idxr, val):
         # GH#47578
         df = DataFrame({"a": [1, 2]})
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2])
-        expected = DataFrame({"a": [np.nan, val]})
-        tm.assert_frame_equal(df, expected)
 
     def test_iloc_setitem_enlarge_no_warning(self):
         # GH#47381
@@ -1579,18 +1549,9 @@ def test_setitem(self):
         # With NaN: because uint64 has no NaN element,
         # the column should be cast to object.
         df2 = df.copy()
-        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+        with pytest.raises(TypeError, match="Invalid value"):
             df2.iloc[1, 1] = pd.NaT
             df2.iloc[1, 2] = pd.NaT
-        result = df2["B"]
-        tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
-        tm.assert_series_equal(
-            df2.dtypes,
-            Series(
-                [np.dtype("uint64"), np.dtype("O"), np.dtype("O")],
-                index=["A", "B", "C"],
-            ),
-        )
 
 
 def test_object_casting_indexing_wraps_datetimelike():
@@ -1925,23 +1886,30 @@ def test_add_new_column_infer_string():
 
 class TestSetitemValidation:
     # This is adapted from pandas/tests/arrays/masked/test_indexing.py
-    # but checks for warnings instead of errors.
-    def _check_setitem_invalid(self, df, invalid, indexer, warn):
-        msg = "Setting an item of incompatible dtype is deprecated"
-        msg = re.escape(msg)
-
+    def _check_setitem_invalid(self, df, invalid, indexer):
         orig_df = df.copy()
 
         # iloc
-        with tm.assert_produces_warning(warn, match=msg):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.iloc[indexer, 0] = invalid
             df = orig_df.copy()
 
         # loc
-        with tm.assert_produces_warning(warn, match=msg):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.loc[indexer, "a"] = invalid
             df = orig_df.copy()
 
+    def _check_setitem_valid(self, df, value, indexer):
+        orig_df = df.copy()
+
+        # iloc
+        df.iloc[indexer, 0] = value
+        df = orig_df.copy()
+
+        # loc
+        df.loc[indexer, "a"] = value
+        df = orig_df.copy()
+
     _invalid_scalars = [
         1 + 2j,
         "True",
@@ -1959,20 +1927,19 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn):
     @pytest.mark.parametrize("indexer", _indexers)
     def test_setitem_validation_scalar_bool(self, invalid, indexer):
         df = DataFrame({"a": [True, False, False]}, dtype="bool")
-        self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
+        self._check_setitem_invalid(df, invalid, indexer)
 
     @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
     @pytest.mark.parametrize("indexer", _indexers)
     def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer):
         df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype)
         if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid):
-            warn = None
+            self._check_setitem_valid(df, invalid, indexer)
         else:
-            warn = FutureWarning
-        self._check_setitem_invalid(df, invalid, indexer, warn)
+            self._check_setitem_invalid(df, invalid, indexer)
 
     @pytest.mark.parametrize("invalid", _invalid_scalars + [True])
     @pytest.mark.parametrize("indexer", _indexers)
     def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer):
         df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype)
-        self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
+        self._check_setitem_invalid(df, invalid, indexer)
diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py
index 264e27c9c122e..ac6f0a1ac0f73 100644
--- a/pandas/tests/frame/indexing/test_mask.py
+++ b/pandas/tests/frame/indexing/test_mask.py
@@ -122,7 +122,7 @@ def test_mask_stringdtype(frame_or_series):
 
 def test_mask_where_dtype_timedelta():
     # https://github.com/pandas-dev/pandas/issues/39548
-    df = DataFrame([Timedelta(i, unit="d") for i in range(5)])
+    df = DataFrame([Timedelta(i, unit="D") for i in range(5)])
 
     expected = DataFrame(np.full(5, np.nan, dtype="timedelta64[ns]"))
     tm.assert_frame_equal(df.mask(df.notna()), expected)
@@ -130,7 +130,7 @@ def test_mask_where_dtype_timedelta():
     expected = DataFrame(
         [np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")]
     )
-    tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected)
+    tm.assert_frame_equal(df.where(df > Timedelta(2, unit="D")), expected)
 
 
 def test_mask_return_dtype():
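The mask/where tests above move from unit="d" to unit="D" because the lowercase alias was deprecated and is being removed; the uppercase spelling is the canonical day unit:

import pandas as pd

td = pd.Timedelta(3, unit="D")     # canonical day unit
assert td == pd.Timedelta("3 days")
# pd.Timedelta(3, unit="d") is the deprecated spelling being phased out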
diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py
index ce771280bc264..aaf95daf232e2 100644
--- a/pandas/tests/frame/indexing/test_set_value.py
+++ b/pandas/tests/frame/indexing/test_set_value.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 from pandas.core.dtypes.common import is_float_dtype
 
@@ -6,7 +7,6 @@
     DataFrame,
     isna,
 )
-import pandas._testing as tm
 
 
 class TestSetValue:
@@ -40,11 +40,8 @@ def test_set_value_resize(self, float_frame, using_infer_string):
         assert is_float_dtype(res["baz"])
         assert isna(res["baz"].drop(["foobar"])).all()
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             res._set_value("foobar", "baz", "sam")
-        assert res.loc["foobar", "baz"] == "sam"
 
     def test_set_value_with_index_dtype_change(self):
         df_orig = DataFrame(
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index 15cdc6566b570..df3b058ca51f9 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -1356,18 +1356,12 @@ def test_frame_setitem_empty_dataframe(self):
 
 def test_full_setter_loc_incompatible_dtype():
     # https://github.com/pandas-dev/pandas/issues/55791
     df = DataFrame({"a": [1, 2]})
-    with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+    with pytest.raises(TypeError, match="Invalid value"):
         df.loc[:, "a"] = True
-    expected = DataFrame({"a": [True, True]})
-    tm.assert_frame_equal(df, expected)
 
-    df = DataFrame({"a": [1, 2]})
-    with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+    with pytest.raises(TypeError, match="Invalid value"):
         df.loc[:, "a"] = {0: 3.5, 1: 4.5}
-    expected = DataFrame({"a": [3.5, 4.5]})
-    tm.assert_frame_equal(df, expected)
 
-    df = DataFrame({"a": [1, 2]})
     df.loc[:, "a"] = {0: 3, 1: 4}
     expected = DataFrame({"a": [3, 4]})
     tm.assert_frame_equal(df, expected)
d1, inplace=True, axis="index") - assert return_value is None - tm.assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) expected = df.copy() @@ -739,11 +723,8 @@ def test_where_interval_fullop_downcast(self, frame_or_series): res = obj.where(~obj.notna(), other) tm.assert_equal(res, other) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(obj.notna(), other, inplace=True) - tm.assert_equal(obj, other.astype(object)) @pytest.mark.parametrize( "dtype", @@ -773,14 +754,10 @@ def test_where_datetimelike_noop(self, dtype): res4 = df.mask(mask2, "foo") tm.assert_frame_equal(res4, df) - expected = DataFrame(4, index=df.index, columns=df.columns) # unlike where, Block.putmask does not downcast - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.mask(~mask2, 4, inplace=True) - tm.assert_frame_equal(df, expected.astype(object)) def test_where_int_downcasting_deprecated(): @@ -934,11 +911,8 @@ def test_where_period_invalid_na(frame_or_series, as_cat, request): result = obj.mask(mask, tdnat) tm.assert_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(mask, tdnat, inplace=True) - tm.assert_equal(obj, expected) def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): @@ -1020,9 +994,7 @@ def test_where_dt64_2d(): "B": dta[:, 1], } ) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): _check_where_equivalences(df, mask, other, expected) # setting nothing in either column diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 55f8052d05cf1..edc90ce77ad3a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -149,7 +149,7 @@ def test_astype_str(self): # see GH#9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) - c = Series([Timedelta(x, unit="d") for x in range(5)]) + c = Series([Timedelta(x, unit="D") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) @@ -715,8 +715,12 @@ def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): df.astype(float, errors=errors) def test_astype_tz_conversion(self): - # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + # GH 35973, GH#58998 + msg = "'d' is deprecated and will be removed in a future version." 
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 55f8052d05cf1..edc90ce77ad3a 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -149,7 +149,7 @@ def test_astype_str(self):
         # see GH#9757
         a = Series(date_range("2010-01-04", periods=5))
         b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
-        c = Series([Timedelta(x, unit="d") for x in range(5)])
+        c = Series([Timedelta(x, unit="D") for x in range(5)])
         d = Series(range(5))
         e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
 
@@ -715,8 +715,12 @@ def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
             df.astype(float, errors=errors)
 
     def test_astype_tz_conversion(self):
-        # GH 35973
-        val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
+        # GH 35973, GH#58998
+        msg = "'d' is deprecated and will be removed in a future version."
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            val = {
+                "tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")
+            }
         df = DataFrame(val)
         result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})
 
@@ -727,7 +731,7 @@ def test_astype_tz_conversion(self):
     @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
     def test_astype_tz_object_conversion(self, tz):
         # GH 35973
-        val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
+        val = {"tz": date_range("2020-08-30", freq="D", periods=2, tz="Europe/London")}
         expected = DataFrame(val)
 
         # convert expected to object dtype from other tz str (independently tested)
diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py
index 126899826fac3..b69db80dee446 100644
--- a/pandas/tests/frame/methods/test_at_time.py
+++ b/pandas/tests/frame/methods/test_at_time.py
@@ -1,8 +1,11 @@
-from datetime import time
+from datetime import (
+    time,
+    timezone,
+)
+import zoneinfo
 
 import numpy as np
 import pytest
-import pytz
 
 from pandas._libs.tslibs import timezones
 
@@ -65,7 +68,7 @@ def test_at_time_nonexistent(self, frame_or_series):
         assert len(rs) == 0
 
     @pytest.mark.parametrize(
-        "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
+        "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=timezone.utc)]
     )
     def test_at_time_errors(self, hour):
         # GH#24043
@@ -83,7 +86,7 @@ def test_at_time_tz(self):
         # GH#24043
         dti = date_range("2018", periods=3, freq="h", tz="US/Pacific")
         df = DataFrame(list(range(len(dti))), index=dti)
-        result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern")))
+        result = df.at_time(time(4, tzinfo=zoneinfo.ZoneInfo("US/Eastern")))
         expected = df.iloc[1:2]
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py
index 82802dd6e99eb..7de87e633cfb1 100644
--- a/pandas/tests/frame/methods/test_join.py
+++ b/pandas/tests/frame/methods/test_join.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import zoneinfo
 
 import numpy as np
 import pytest
@@ -543,17 +544,14 @@ def test_merge_join_different_levels_raises(self):
             df1.join(df2, on="a")
 
     def test_frame_join_tzaware(self):
+        tz = zoneinfo.ZoneInfo("US/Central")
         test1 = DataFrame(
             np.zeros((6, 3)),
-            index=date_range(
-                "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central"
-            ),
+            index=date_range("2012-11-15 00:00:00", periods=6, freq="100ms", tz=tz),
         )
         test2 = DataFrame(
             np.zeros((3, 3)),
-            index=date_range(
-                "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central"
-            ),
+            index=date_range("2012-11-15 00:00:00", periods=3, freq="250ms", tz=tz),
             columns=range(3, 6),
         )
 
@@ -561,4 +559,4 @@
         expected = test1.index.union(test2.index)
         tm.assert_index_equal(result.index, expected)
-        assert result.index.tz.zone == "US/Central"
+        assert result.index.tz.key == "US/Central"
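With zoneinfo supplying the timezone object above, the joined index carries a ZoneInfo instance, so the assertion reads .key rather than pytz's .zone. The two attributes compare like so:

import zoneinfo

tz = zoneinfo.ZoneInfo("US/Central")
assert tz.key == "US/Central"      # zoneinfo spelling
# pytz.timezone("US/Central").zone is the legacy pytz spelling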
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 32ae4c0ff2f50..f35b77da0b547 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -710,14 +710,14 @@ def test_quantile_empty_no_columns(self, interp_method):
         result = df.quantile(
             0.5, numeric_only=True, interpolation=interpolation, method=method
         )
-        expected = Series([], index=[], name=0.5, dtype=np.float64)
+        expected = Series([], name=0.5, dtype=np.float64)
         expected.index.name = "captain tightpants"
         tm.assert_series_equal(result, expected)
 
         result = df.quantile(
             [0.5], numeric_only=True, interpolation=interpolation, method=method
         )
-        expected = DataFrame([], index=[0.5], columns=[])
+        expected = DataFrame([], index=[0.5])
         expected.columns.name = "captain tightpants"
         tm.assert_frame_equal(result, expected)
 
@@ -926,3 +926,12 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis):
             expected_data, name=0.5, index=Index(expected_index), dtype=np.float64
         )
         tm.assert_series_equal(result, expected)
+
+
+def test_multi_quantile_numeric_only_retains_columns():
+    df = DataFrame(list("abc"))
+    result = df.quantile([0.5, 0.7], numeric_only=True)
+    expected = DataFrame(index=[0.5, 0.7])
+    tm.assert_frame_equal(
+        result, expected, check_index_type=True, check_column_type=True
+    )
diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index 45109991c4553..37adc31fb0f4d 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -754,7 +754,10 @@ def test_reindex_axes(self):
             index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)],
             columns=["a", "b", "c"],
         )
-        time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
+
+        msg = "'d' is deprecated and will be removed in a future version."
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
         some_cols = ["a", "b"]
 
         index_freq = df.reindex(index=time_freq).index.freq
columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[1, 2]) + b = pd.Categorical(final_data[:, 1], categories=[1, 2]) + + expected = DataFrame({"a": a, "b": b}) + result = df.replace(replace_dict, 2) + tm.assert_frame_equal(result, expected) + msg = r"DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + return_value = df.replace(replace_dict, 2, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1345,15 +1337,17 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"d": "z"}) + ) + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"obj1": "obj9"}) + ) + result = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1378,12 +1372,11 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + result = input_df.apply( + lambda x: x.cat.rename_categories( + {"a": "z", "obj1": "obj9", "cat1": "catX"} + ) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 22ce091d4ed62..980dd5243daa5 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -600,8 +600,8 @@ def test_reset_index_with_drop( {"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, ), ( - [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], - {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + [(pd.NaT, 1), (pd.Timedelta(123, "D"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "D")], "b": [1, 2], "x": [11, 12]}, ), ], ) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 0272b679e85a2..c43d947b4877e 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -2,11 +2,13 @@ OrderedDict, defaultdict, ) -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -import pytz from pandas import ( NA, @@ -209,15 +211,15 @@ def test_to_dict_tz(self): # GH#18372 When converting to dict with orient='records' columns of # datetime that are tz-aware were not converted to required arrays data = [ - (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), - (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 
21, 53, 0, 219225, tzinfo=timezone.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=timezone.utc),), ] df = DataFrame(list(data), columns=["d"]) result = df.to_dict(orient="records") expected = [ - {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, - {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=timezone.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=timezone.utc)}, ] tm.assert_dict_equal(result[0], expected[0]) tm.assert_dict_equal(result[1], expected[1]) diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py index e9209f218bca9..5ee4021102f22 100644 --- a/pandas/tests/frame/methods/test_tz_convert.py +++ b/pandas/tests/frame/methods/test_tz_convert.py @@ -1,3 +1,5 @@ +import zoneinfo + import numpy as np import pytest @@ -13,28 +15,34 @@ class TestTZConvert: def test_tz_convert(self, frame_or_series): - rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + rng = date_range( + "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern") + ) obj = DataFrame({"a": 1}, index=rng) obj = tm.get_obj(obj, frame_or_series) - result = obj.tz_convert("Europe/Berlin") - expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + berlin = zoneinfo.ZoneInfo("Europe/Berlin") + result = obj.tz_convert(berlin) + expected = DataFrame({"a": 1}, rng.tz_convert(berlin)) expected = tm.get_obj(expected, frame_or_series) - assert result.index.tz.zone == "Europe/Berlin" + assert result.index.tz.key == "Europe/Berlin" tm.assert_equal(result, expected) def test_tz_convert_axis1(self): - rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + rng = date_range( + "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern") + ) obj = DataFrame({"a": 1}, index=rng) obj = obj.T - result = obj.tz_convert("Europe/Berlin", axis=1) - assert result.columns.tz.zone == "Europe/Berlin" + berlin = zoneinfo.ZoneInfo("Europe/Berlin") + result = obj.tz_convert(berlin, axis=1) + assert result.columns.tz.key == "Europe/Berlin" - expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + expected = DataFrame({"a": 1}, rng.tz_convert(berlin)) tm.assert_equal(result, expected.T) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 269b9e372bd70..ea63b2264d4f6 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -152,18 +152,9 @@ def test_update_with_different_dtype(self): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.update({"c": Series(["foo"], index=[0])}) - expected = DataFrame( - { - "a": [1, 3], - "b": [np.nan, 2], - "c": Series(["foo", np.nan], dtype="object"), - } - ) - tm.assert_frame_equal(df, expected) - def test_update_modify_view(self, using_infer_string): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index c68171ab254c7..b4c16b94fcf8b 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,6 +1,7 @@ -from datetime import datetime - -import pytz +from datetime import ( + datetime, + timezone, +) from pandas import DataFrame import pandas._testing as tm @@ -13,7 +14,7 @@ def 
test_set_axis_setattr_index(self): # GH 6785 # set the index manually - df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}]) + df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=timezone.utc), "foo": 1}]) expected = df.set_index("ts") df.index = df["ts"] df.pop("ts") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index da0504458cf5d..2d5772eb5cb53 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,12 +14,12 @@ ) import functools import re +import zoneinfo import numpy as np from numpy import ma from numpy.ma import mrecords import pytest -import pytz from pandas._config import using_pyarrow_string_dtype @@ -1908,8 +1908,7 @@ def test_constructor_with_datetimes2(self): def test_constructor_with_datetimes3(self): # GH 7594 # don't coerce tz-aware - tz = pytz.timezone("US/Eastern") - dt = tz.localize(datetime(2012, 1, 1)) + dt = datetime(2012, 1, 1, tzinfo=zoneinfo.ZoneInfo("US/Eastern")) df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt @@ -2523,11 +2522,13 @@ def check_views(c_only: bool = False): check_views() # TODO: most of the rest of this test belongs in indexing tests - if lib.is_np_dtype(df.dtypes.iloc[0], "fciuO"): - warn = None + should_raise = not lib.is_np_dtype(df.dtypes.iloc[0], "fciuO") + if should_raise: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + return else: - warn = FutureWarning - with tm.assert_produces_warning(warn, match="incompatible dtype"): df.iloc[0, 0] = 0 df.iloc[0, 1] = 0 if not copy: diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 643d342b052a4..b791868b173e4 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -202,6 +202,25 @@ def test_eval_simple(self, engine, parser): expected = df["a"] tm.assert_series_equal(expected, res) + def test_extension_array_eval(self, engine, parser, request): + # GH#58748 + if engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr does not support extension array dtypes" + ) + request.applymarker(mark) + df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) + result = df.eval("a / b", engine=engine, parser=parser) + expected = Series(pd.array([0.25, 0.40, 0.50])) + tm.assert_series_equal(result, expected) + + def test_complex_eval(self, engine, parser): + # GH#21374 + df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]}) + result = df.eval("a/b", engine=engine, parser=parser) + expected = Series([1.5 + 0.5j]) + tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): @@ -744,7 +763,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture df_index = date_range( - start="2019-01-01", freq="1d", periods=10, tz=tz, name="time" + start="2019-01-01", freq="1D", periods=10, tz=tz, name="time" ) expected = DataFrame(index=df_index) df = DataFrame(index=df_index) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f6e0251d52de1..f799495d8025a 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -38,10 +38,10 @@ def test_repr_should_return_str(self): index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] cols = ["\u03c8"] df = DataFrame(data, columns=cols, index=index1) - assert type(df.__repr__()) is str # 
noqa: E721 + assert type(df.__repr__()) is str ser = df[cols[0]] - assert type(ser.__repr__()) is str # noqa: E721 + assert type(ser.__repr__()) is str def test_repr_bytes_61_lines(self): # GH#12857 diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0b607d91baf65..b591b1b1092d4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -93,8 +93,7 @@ def test_get_numeric_data(self, frame_or_series): if isinstance(o, DataFrame): # preserve columns dtype expected.columns = o.columns[:0] - # https://github.com/pandas-dev/pandas/issues/50862 - tm.assert_equal(result.reset_index(drop=True), expected) + tm.assert_equal(result, expected) # get the bool data arr = np.array([True, True, False, True]) @@ -102,6 +101,11 @@ def test_get_numeric_data(self, frame_or_series): result = o._get_numeric_data() tm.assert_equal(result, o) + def test_get_bool_data_empty_preserve_index(self): + expected = Series([], dtype="bool") + result = expected._get_bool_data() + tm.assert_series_equal(result, expected, check_index_type=True) + def test_nonzero(self, frame_or_series): # GH 4633 # look at the boolean/nonzero behavior for objects diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index 491f621783a76..d8401a8b2ae3f 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -9,7 +9,6 @@ date_range, ) import pandas._testing as tm -from pandas.util.version import Version pytest.importorskip("xarray") @@ -30,17 +29,11 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string, request): + def test_to_xarray_index_types(self, index_flat, df, using_infer_string): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") - import xarray - - if Version(xarray.__version__) >= Version("2024.5"): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/9026") - ) from xarray import Dataset diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c1dc8953580a..13fb9cfc4c0e4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -148,8 +148,8 @@ def test_len_nan_group(): def test_groupby_timedelta_median(): # issue 57926 - expected = Series(data=Timedelta("1d"), index=["foo"]) - df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]}) + expected = Series(data=Timedelta("1D"), index=["foo"]) + df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1D")]}) gb = df.groupby("label")["timedelta"] actual = gb.median() tm.assert_series_equal(actual, expected, check_names=False) @@ -2445,7 +2445,7 @@ def test_rolling_wrong_param_min_period(): test_df.columns = ["name", "val"] result_error_msg = ( - r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'$" + r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'" ) with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() @@ -2985,3 +2985,14 @@ def test_groupby_agg_namedagg_with_duplicate_columns(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_multi_index_codes(): + # GH#54347 + df = DataFrame( + {"A": [1, 2, 3, 4], "B": [1, float("nan"), 2, float("nan")], "C": [2, 4, 6, 8]} + ) + df_grouped = df.groupby(["A", "B"], 
dropna=False).sum() + + index = df_grouped.index + tm.assert_index_equal(index, MultiIndex.from_frame(index.to_frame())) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 4749e845a0e59..cedbd577da0ca 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -420,7 +420,7 @@ def test_groupby_drop_nan_with_multi_index(): ), ), "datetime64[ns]", - "period[d]", + "period[D]", "Sparse[float]", ], ) @@ -437,7 +437,7 @@ def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): # Unique values to use for grouper, depends on dtype if dtype in ("string", "string[pyarrow]"): uniques = {"x": "x", "y": "y", "z": pd.NA} - elif dtype in ("datetime64[ns]", "period[d]"): + elif dtype in ("datetime64[ns]", "period[D]"): uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA} else: uniques = {"x": 1, "y": 2, "z": np.nan} diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index afbc64429e93c..7e7c84fa2b390 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -291,8 +291,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): [ "not allowed for this dtype", "cannot be performed against 'object' dtypes", - # On PY39 message is "a number"; on PY310 and after is "a real number" - "must be a string or a.* number", + "must be a string or a real number", "unsupported operand type", "function is not implemented for this dtype", re.escape(f"agg function failed [how->{kernel},dtype->object]"), diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index edc94b2beeec1..00438c2100bad 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -982,7 +982,7 @@ def test_groupby_sum_timedelta_with_nat(): df = DataFrame( { "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + "b": [pd.Timedelta("1D"), pd.Timedelta("2D"), pd.Timedelta("3D"), pd.NaT], } ) td3 = pd.Timedelta(days=3) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ea556d043be2d..44e8e050cb756 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -5,11 +5,11 @@ from datetime import ( datetime, timedelta, + timezone, ) import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -774,12 +774,12 @@ def test_groupby_with_timezone_selection(self): def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - - df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) - assert df["b"][0].tzinfo == pytz.utc + utc = timezone.utc + df = DataFrame({"a": [1], "b": [datetime.now(utc)]}) + assert df["b"][0].tzinfo == utc df = DataFrame({"a": [1, 2, 3]}) - df["b"] = datetime.now(pytz.utc) - assert df["b"][0].tzinfo == pytz.utc + df["b"] = datetime.now(utc) + assert df["b"][0].tzinfo == utc def test_datetime_count(self): df = DataFrame( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 726c57081373c..a189d6772ece4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1591,3 +1591,12 @@ def test_min_one_dim_no_type_coercion(): expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, 
dtype="int32") tm.assert_frame_equal(expected, result) + + +def test_nan_in_cumsum_group_label(): + # GH#58811 + df = DataFrame({"A": [1, None], "B": [2, 3]}, dtype="Int16") + gb = df.groupby("A")["B"] + result = gb.cumsum() + expected = Series([2, None], dtype="Int16", name="B") + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/indexes/datetimelike_/test_indexing.py b/pandas/tests/indexes/datetimelike_/test_indexing.py index ee7128601256a..7b2c81aaf17de 100644 --- a/pandas/tests/indexes/datetimelike_/test_indexing.py +++ b/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): - vals = np.tile(3600 * 10**9 * np.arange(3), 2) + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..81dc3b3ecc45e 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -3,7 +3,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -251,6 +250,8 @@ def _check_rng(rng): _check_rng(rng_utc) def test_index_convert_to_datetime_array_explicit_pytz(self): + pytz = pytest.importorskip("pytz") + def _check_rng(rng): converted = rng.to_pydatetime() assert isinstance(converted, np.ndarray) diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index ebfe490e0e067..4a5b7bcc1a86f 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz from pandas import ( NA, @@ -133,49 +133,59 @@ def test_insert3(self, unit): assert result.name == expected.name assert result.freq is None - def test_insert4(self, unit): - for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range( - "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit - ) - # preserve freq - expected = date_range( - "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit - ) - for d in [ - Timestamp("2000-01-01 15:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 10:00", - "2000-01-01 11:00", - "2000-01-01 12:00", - "2000-01-01 13:00", - "2000-01-01 14:00", - "2000-01-01 10:00", - ], - name="idx", - tz=tz, - freq=None, - ).as_unit(unit) - # reset freq to None - for d in [ - Timestamp("2000-01-01 10:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.tz == expected.tz - assert result.freq is None + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", 
periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit + ) + tz = zoneinfo.ZoneInfo(tz) + d = to_ts(Timestamp("2000-01-01 15:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4_no_freq(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ).as_unit(unit) + # reset freq to None + d = to_ts(Timestamp("2000-01-01 10:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None # TODO: also changes DataFrame.__setitem__ with expansion def test_insert_mismatched_tzawareness(self): @@ -214,7 +224,7 @@ def test_insert_mismatched_tz(self): assert expected.dtype == idx.dtype tm.assert_index_equal(result, expected) - item = datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern")) + item = datetime(2000, 1, 4, tzinfo=zoneinfo.ZoneInfo("US/Eastern")) result = idx.insert(3, item) expected = Index( list(idx[:3]) + [item.astimezone(idx.tzinfo)] + list(idx[3:]), diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index 375dea01974bb..a202627550cd2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -1,7 +1,7 @@ from datetime import datetime +import zoneinfo import pytest -import pytz from pandas.errors import NullFrequencyError @@ -13,8 +13,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexShift: # ------------------------------------------------------------- @@ -122,24 +120,28 @@ def test_dti_shift_across_dst(self, unit): ) def test_dti_shift_near_midnight(self, shift, result_time, unit): # GH 8616 - dt = datetime(2014, 11, 14, 0) - dt_est = pytz.timezone("EST").localize(dt) + tz = zoneinfo.ZoneInfo("US/Eastern") + dt_est = datetime(2014, 11, 14, 0, tzinfo=tz) idx = DatetimeIndex([dt_est]).as_unit(unit) ser = Series(data=[1], index=idx) result = ser.shift(shift, freq="h") - exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + exp_index = DatetimeIndex([result_time], tz=tz).as_unit(unit) expected = Series(1, index=exp_index) tm.assert_series_equal(result, expected) def test_shift_periods(self, unit): # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = date_range(start=START, end=END, periods=3, unit=unit) + idx = date_range( + start=datetime(2009, 1, 1), end=datetime(2010, 1, 1), periods=3, unit=unit + ) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) @pytest.mark.parametrize("freq", ["B", "C"]) def test_shift_bday(self, freq, unit): - rng = date_range(START, END, freq=freq, unit=unit) + rng = date_range( + datetime(2009, 1, 1), datetime(2010, 1, 1), freq=freq, 
unit=unit + ) shifted = rng.shift(5) assert shifted[0] == rng[5] assert shifted.freq == rng.freq @@ -153,11 +155,21 @@ def test_shift_bday(self, freq, unit): assert shifted.freq == rng.freq def test_shift_bmonth(self, performance_warning, unit): - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) shifted = rng.shift(1, freq=pd.offsets.BDay()) assert shifted[0] == rng[0] + pd.offsets.BDay() - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) with tm.assert_produces_warning(performance_warning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() diff --git a/pandas/tests/indexes/datetimes/methods/test_snap.py b/pandas/tests/indexes/datetimes/methods/test_snap.py index 651e4383a3fac..a3c06ac6257cf 100644 --- a/pandas/tests/indexes/datetimes/methods/test_snap.py +++ b/pandas/tests/indexes/datetimes/methods/test_snap.py @@ -7,6 +7,8 @@ import pandas._testing as tm +@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") +@pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") @pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) @pytest.mark.parametrize("name", [None, "my_dti"]) def test_dti_snap(name, tz, unit): @@ -27,7 +29,9 @@ def test_dti_snap(name, tz, unit): dti = dti.as_unit(unit) result = dti.snap(freq="W-MON") - expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + msg = "'w-mon' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") expected = expected.repeat([3, 4]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) @@ -37,7 +41,9 @@ def test_dti_snap(name, tz, unit): result = dti.snap(freq="B") - expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + msg = "'b' is deprecated and will be removed in a future version." 
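+    # as with "w-mon" above, the lowercase "b" alias is kept on purpose so the
+    # FutureWarning it now emits is asserted rather than silenced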
+ with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") expected = expected.repeat([1, 1, 1, 2, 2]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 8e279162b7012..cd4a142dd5b30 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -1,7 +1,8 @@ +from datetime import timezone + import dateutil.tz from dateutil.tz import tzlocal import pytest -import pytz from pandas._libs.tslibs.ccalendar import MONTHS from pandas._libs.tslibs.offsets import MonthEnd @@ -155,7 +156,13 @@ def test_to_period_microsecond(self): @pytest.mark.parametrize( "tz", - ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + [ + "US/Eastern", + timezone.utc, + tzlocal(), + "dateutil/US/Eastern", + dateutil.tz.tzutc(), + ], ) def test_to_period_tz(self, tz): ts = date_range("1/1/2000", "2/1/2000", tz=tz) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py index b2cf488ac8313..9eabb742b93a4 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py @@ -4,7 +4,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -260,11 +259,14 @@ def test_dti_tz_convert_tzlocal(self): [ "US/Eastern", "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tz) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py index ad7769c6b9671..c6697fd169e8a 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -1,7 +1,9 @@ from datetime import ( datetime, timedelta, + timezone, ) +from zoneinfo import ZoneInfo import dateutil.tz from dateutil.tz import gettz @@ -19,22 +21,13 @@ ) import pandas._testing as tm -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type [misc] - ZoneInfo = None # type: ignore[misc, assignment] - -easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] -if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - easts.append(tz) +@pytest.fixture(params=["pytz/US/Eastern", gettz("US/Eastern"), ZoneInfo("US/Eastern")]) +def tz(request): + if isinstance(request.param, str) and request.param.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + return pytz.timezone(request.param.removeprefix("pytz/")) + return request.param class TestTZLocalize: @@ -88,7 +81,6 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the 
transition @@ -96,7 +88,6 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): # With repeated hours, we can infer the transition dr = date_range( @@ -116,7 +107,6 @@ def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) tm.assert_index_equal(result2, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer3(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) @@ -124,7 +114,6 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz): localized_infer = dr.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(localized, localized_infer) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) @@ -143,7 +132,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # UTC is OK dr = date_range( - datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=pytz.utc + datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=timezone.utc ) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) @@ -181,15 +170,6 @@ def test_dti_tz_localize(self, prefix): with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) - @pytest.mark.parametrize( - "tz", - [ - "US/Eastern", - "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - ], - ) def test_dti_tz_localize_utc_conversion(self, tz): # Localizing to time zone should: # 1) check for DST ambiguities @@ -245,7 +225,6 @@ def test_dti_tz_localize_tzlocal(self): dti2 = dti.tz_localize(None) tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_nat(self, tz): times = [ "11/06/2011 00:00", @@ -270,7 +249,6 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_flags(self, tz, unit): # November 6, 2011, fall back, repeat 2 AM hour @@ -321,8 +299,7 @@ def test_dti_tz_localize_ambiguous_flags(self, tz, unit): dr = dr.append(dr) tm.assert_index_equal(dr, localized) - @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): + def test_dti_tz_localize_ambiguous_flags2(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) is_dst = np.array([1] * 10) @@ -332,8 +309,8 @@ def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): def test_dti_tz_localize_bdate_range(self): dr = bdate_range("1/1/2009", "1/1/2010") - dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) + dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=timezone.utc) + localized = dr.tz_localize(timezone.utc) tm.assert_index_equal(dr_utc, localized) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 
43a7cdf63d9b9..aba440ceeb56b 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -7,6 +7,7 @@ ) from functools import partial from operator import attrgetter +import zoneinfo import dateutil import dateutil.tz @@ -152,7 +153,9 @@ def test_construction_caching(self): df = pd.DataFrame( { "dt": date_range("20130101", periods=3), - "dttz": date_range("20130101", periods=3, tz="US/Eastern"), + "dttz": date_range( + "20130101", periods=3, tz=zoneinfo.ZoneInfo("US/Eastern") + ), "dt_with_null": [ Timestamp("20130101"), pd.NaT, @@ -161,7 +164,7 @@ def test_construction_caching(self): "dtns": date_range("20130101", periods=3, freq="ns"), } ) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" @pytest.mark.parametrize( "kwargs", @@ -198,7 +201,11 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): # incompat tz/dtype msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") + DatetimeIndex( + i.tz_localize(None).asi8, + dtype=i.dtype, + tz=zoneinfo.ZoneInfo("US/Hawaii"), + ) def test_construction_index_with_mixed_timezones(self): # gh-11488: no tz results in DatetimeIndex @@ -736,7 +743,7 @@ def test_disallow_setting_tz(self): dti = DatetimeIndex(["2010"], tz="UTC") msg = "Cannot directly set timezone" with pytest.raises(AttributeError, match=msg): - dti.tz = pytz.timezone("US/Pacific") + dti.tz = zoneinfo.ZoneInfo("US/Pacific") @pytest.mark.parametrize( "tz", @@ -764,7 +771,9 @@ def test_constructor_start_end_with_tz(self, tz): @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): # GH 18595 - non_norm_tz = Timestamp("2010", tz=tz).tz + pytz = pytest.importorskip("pytz") + tz_in = pytz.timezone(tz) + non_norm_tz = Timestamp("2010", tz=tz_in).tz result = DatetimeIndex(["2010"], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz @@ -914,7 +923,9 @@ def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(sel expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_from_tzaware_datetime(self, tz): d = [datetime(2012, 8, 19, tzinfo=tz)] @@ -963,7 +974,7 @@ def test_dti_convert_datetime_list(self, tzstr): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) @@ -972,6 +983,8 @@ def test_dti_convert_datetime_list(self, tzstr): def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): # GH#47471 check that we get the same raising behavior in the DTI # constructor and Timestamp constructor + if isinstance(tz, str) and tz.startswith("pytz/"): + tz = pytz.timezone(tz.removeprefix("pytz/")) dtstr = "2013-11-03 01:59:59.999999" item = dtstr if not use_str: diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 8bf51bcd38862..b37b5cf74b347 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -12,7 +12,6 @@ import numpy as np import pytest import pytz -from pytz import timezone from pandas._libs.tslibs import 
timezones
 from pandas._libs.tslibs.offsets import (
@@ -97,6 +96,7 @@ def test_date_range_timestamp_equiv_dateutil(self):
         assert ts == stamp
 
     def test_date_range_timestamp_equiv_explicit_pytz(self):
+        pytz = pytest.importorskip("pytz")
         rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern"))
 
         stamp = rng[0]
@@ -490,7 +490,8 @@ def test_range_bug(self, unit):
 
     def test_range_tz_pytz(self):
         # see gh-2906
-        tz = timezone("US/Eastern")
+        pytz = pytest.importorskip("pytz")
+        tz = pytz.timezone("US/Eastern")
         start = tz.localize(datetime(2011, 1, 1))
         end = tz.localize(datetime(2011, 1, 3))
 
@@ -517,14 +518,16 @@ def test_range_tz_pytz(self):
         ],
     )
     def test_range_tz_dst_straddle_pytz(self, start, end):
-        start = Timestamp(start, tz="US/Eastern")
-        end = Timestamp(end, tz="US/Eastern")
+        pytz = pytest.importorskip("pytz")
+        tz = pytz.timezone("US/Eastern")
+        start = Timestamp(start, tz=tz)
+        end = Timestamp(end, tz=tz)
         dr = date_range(start, end, freq="D")
         assert dr[0] == start
         assert dr[-1] == end
         assert np.all(dr.hour == 0)
 
-        dr = date_range(start, end, freq="D", tz="US/Eastern")
+        dr = date_range(start, end, freq="D", tz=tz)
         assert dr[0] == start
         assert dr[-1] == end
         assert np.all(dr.hour == 0)
@@ -533,7 +536,7 @@ def test_range_tz_dst_straddle_pytz(self, start, end):
             start.replace(tzinfo=None),
             end.replace(tzinfo=None),
             freq="D",
-            tz="US/Eastern",
+            tz=tz,
         )
         assert dr[0] == start
         assert dr[-1] == end
@@ -788,6 +791,28 @@ def test_frequency_A_raises(self, freq):
         with pytest.raises(ValueError, match=msg):
             date_range("1/1/2000", periods=2, freq=freq)
 
+    @pytest.mark.parametrize(
+        "freq,freq_depr",
+        [
+            ("2W", "2w"),
+            ("2W-WED", "2w-wed"),
+            ("2B", "2b"),
+            ("2D", "2d"),
+            ("2C", "2c"),
+        ],
+    )
+    def test_date_range_depr_lowercase_frequency(self, freq, freq_depr):
+        # GH#58998
+        depr_msg = (
+            f"'{freq_depr[1:]}' is deprecated and will be removed "
+            "in a future version."
+        )
+
+        expected = date_range("1/1/2000", periods=4, freq=freq)
+        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+            result = date_range("1/1/2000", periods=4, freq=freq_depr)
+        tm.assert_index_equal(result, expected)
+
 
 class TestDateRangeTZ:
     """Tests for date_range with timezones"""
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index cc2b802de2a16..04334a1d8d0c8 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -133,29 +133,12 @@ def test_asarray_tz_aware(self):
 
         tm.assert_numpy_array_equal(result, expected)
 
-    def test_CBH_deprecated(self):
-        msg = "'CBH' is deprecated and will be removed in a future version."
- - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range( - dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" - ) - result = DatetimeIndex( - [ - "2022-12-12 09:00:00", - "2022-12-12 10:00:00", - "2022-12-12 11:00:00", - "2022-12-12 12:00:00", - "2022-12-12 13:00:00", - "2022-12-12 14:00:00", - "2022-12-12 15:00:00", - "2022-12-12 16:00:00", - ], - dtype="datetime64[ns]", - freq="cbh", - ) + @pytest.mark.parametrize("freq", ["2H", "2BH", "2S"]) + def test_CBH_raises(self, freq): + msg = f"Invalid frequency: {freq}" - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range(dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq=freq) @pytest.mark.parametrize("freq", ["2BM", "1bm", "2BQ", "1BQ-MAR", "2BY-JUN", "1by"]) def test_BM_BQ_BY_raises(self, freq): diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 6e4e22942ab07..4551fdf073193 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import dateutil.tz import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -276,7 +278,7 @@ def test_dti_summary(self): result = idx._summary() assert result == expected - @pytest.mark.parametrize("tz", [None, pytz.utc, dateutil.tz.tzutc()]) + @pytest.mark.parametrize("tz", [None, timezone.utc, dateutil.tz.tzutc()]) @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_repr_etc_smoke(self, tz, freq): # only really care that it works diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 173b32b12e2d1..94175a56f1c4a 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -35,7 +35,7 @@ def test_string_index_series_name_converted(self): def test_stringified_slice_with_tz(self): # GH#2658 start = "2013-01-07" - idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") + idx = date_range(start=start, freq="1D", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index fc3a1d4721841..f04f1592ea0c1 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -1,11 +1,11 @@ from datetime import ( datetime, + timedelta, timezone, ) import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -560,6 +560,7 @@ def test_intersection_list(self): tm.assert_index_equal(res, idx) def test_month_range_union_tz_pytz(self, sort): + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") early_start = datetime(2011, 1, 1) @@ -648,7 +649,7 @@ def test_intersection_bug(self): assert result.freq == b.freq @pytest.mark.parametrize( - "tz", [None, "UTC", "Europe/Berlin", pytz.FixedOffset(-60)] + "tz", [None, "UTC", "Europe/Berlin", timezone(timedelta(hours=-1))] ) def test_intersection_dst_transition(self, tz): # GH 46702: Europe/Berlin has DST transition @@ -664,3 +665,19 @@ def test_intersection_dst_transition(self, tz): result = index1.union(index2) expected = date_range("2021-10-28", periods=6, freq="D", tz="Europe/London") 
tm.assert_index_equal(result, expected) + + +def test_union_non_nano_rangelike(): + # GH 59036 + l1 = DatetimeIndex( + ["2024-05-11", "2024-05-12"], dtype="datetime64[us]", name="Date", freq="D" + ) + l2 = DatetimeIndex(["2024-05-13"], dtype="datetime64[us]", name="Date", freq="D") + result = l1.union(l2) + expected = DatetimeIndex( + ["2024-05-11", "2024-05-12", "2024-05-13"], + dtype="datetime64[us]", + name="Date", + freq="D", + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 0c8bdbdd2fb22..e4b8a909add0d 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -8,11 +8,11 @@ timezone, tzinfo, ) +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -184,8 +184,11 @@ def test_dti_tz_nat(self, tzstr): assert isna(idx[1]) assert idx[0].tzinfo is not None - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + @pytest.mark.parametrize("tzstr", ["pytz/US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): + if tzstr.startswith("pytz/"): + pytest.importorskip("pytz") + tzstr = tzstr.removeprefix("pytz/") tz = timezones.maybe_get_tz(tzstr) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") @@ -206,15 +209,17 @@ def test_utc_box_timestamp_and_localize(self, tzstr): rng_eastern[0].tzinfo ) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")] + ) def test_with_tz(self, tz): # just want it to work - start = datetime(2011, 3, 12, tzinfo=pytz.utc) + start = datetime(2011, 3, 12, tzinfo=timezone.utc) dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) - assert dr.tz is pytz.utc + assert dr.tz is timezone.utc # DateRange with naive datetimes - dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=timezone.utc) dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) # normalized @@ -231,13 +236,16 @@ def test_with_tz(self, tz): # datetimes with tzinfo set dr = bdate_range( - datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + datetime(2005, 1, 1, tzinfo=timezone.utc), + datetime(2009, 1, 1, tzinfo=timezone.utc), ) msg = "Start and end cannot both be tz-aware with different timezones" with pytest.raises(Exception, match=msg): - bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) + bdate_range(datetime(2005, 1, 1, tzinfo=timezone.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 06dbb33aadf97..cc3dadc6bb61c 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -114,11 +114,11 @@ def test_append_index(): result = idx1.append(midx_lv2) # see gh-7112 - tz = 
pytz.timezone("Asia/Tokyo") + tz = zoneinfo.ZoneInfo("Asia/Tokyo") expected_tuples = [ - (1.1, tz.localize(datetime(2011, 1, 1))), - (1.2, tz.localize(datetime(2011, 1, 2))), - (1.3, tz.localize(datetime(2011, 1, 3))), + (1.1, datetime(2011, 1, 1, tzinfo=tz)), + (1.2, datetime(2011, 1, 2, tzinfo=tz)), + (1.3, datetime(2011, 1, 3, tzinfo=tz)), ] expected = Index([1.1, 1.2, 1.3] + expected_tuples) tm.assert_index_equal(result, expected) @@ -138,9 +138,9 @@ def test_append_index(): expected = Index._simple_new( np.array( [ - (1.1, tz.localize(datetime(2011, 1, 1)), "A"), - (1.2, tz.localize(datetime(2011, 1, 2)), "B"), - (1.3, tz.localize(datetime(2011, 1, 3)), "C"), + (1.1, datetime(2011, 1, 1, tzinfo=tz), "A"), + (1.2, datetime(2011, 1, 2, tzinfo=tz), "B"), + (1.3, datetime(2011, 1, 3, tzinfo=tz), "C"), ] + expected_tuples, dtype=object, diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/indexes/multi/test_util.py similarity index 78% rename from pandas/tests/reshape/test_util.py rename to pandas/tests/indexes/multi/test_util.py index d2971db3d7aa2..68792ce53f04e 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/indexes/multi/test_util.py @@ -6,7 +6,7 @@ date_range, ) import pandas._testing as tm -from pandas.core.reshape.util import cartesian_product +from pandas.core.indexes.multi import cartesian_product class TestCartesianProduct: @@ -28,22 +28,6 @@ def test_datetimeindex(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) - def test_tzaware_retained(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - - def test_tzaware_retained_categorical(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - @pytest.mark.parametrize("x, y", [[[], []], [[0, 1], []], [[], ["a", "b", "c"]]]) def test_empty(self, x, y): # product of empty factors diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index aca765e7167b2..be07a71b283fd 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -73,6 +73,30 @@ def test_period_index_T_L_U_N_raises(self, freq_depr): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + @pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_index_depr_lowercase_frequency(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) + + expected = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) + + expected = period_range(start="2020-01-01", end="2020-01-02", freq=freq) + tm.assert_index_equal(result, expected) + class TestPeriodIndex: def test_from_ordinals(self): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 4e58dc1f324b2..51b03024ce272 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -203,7 +203,7 @@ def test_constructor_U(self): with pytest.raises(ValueError, match="Invalid frequency: X"): period_range("2007-1-1", periods=500, freq="X") - @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"]) + @pytest.mark.parametrize("freq_depr", ["2MIN", "2US", "2NS"]) def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): # GH#52536, GH#54939 msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " @@ -212,9 +212,9 @@ def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) - @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y"]) - def test_lowercase_freq_from_time_series_raises(self, freq): - # GH#52536, GH#54939 + @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y", "2H", "2S"]) + def test_incorrect_case_freq_from_time_series_raises(self, freq): + # GH#52536, GH#54939, GH#59143 msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e701a49ea93ad..16908fbb4fecc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1558,7 +1558,7 @@ def test_ensure_index_uint64(self): def test_get_combined_index(self): result = _get_combined_index([]) - expected = Index([]) + expected = RangeIndex(0) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index a0986d1496881..9bbf06dc51a0c 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -37,7 +37,7 @@ def test_tdi_shift_minutes(self): def test_tdi_shift_int(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(1) expected = TimedeltaIndex( @@ -54,7 +54,7 @@ def test_tdi_shift_int(self): def test_tdi_shift_nonstandard_freq(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(3, freq="2D 1s") expected = TimedeltaIndex( diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 12ac5dd63bd8c..ace0ab7990138 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -168,7 +168,7 @@ def test_constructor_coverage(self): # NumPy string 
array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) - expected = to_timedelta([1, 2, 3], unit="d") + expected = to_timedelta([1, 2, 3], unit="D") tm.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) @@ -239,3 +239,28 @@ def test_from_categorical(self): ci = pd.CategoricalIndex(tdi) result = TimedeltaIndex(ci) tm.assert_index_equal(result, tdi) + + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#52536, GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." + + expected = TimedeltaIndex([f"1{unit}", f"2{unit}"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = TimedeltaIndex([f"1{unit_depr}", f"2{unit_depr}"]) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = to_timedelta([1, 2], unit=unit_depr) + tm.assert_index_equal(tdi, expected) diff --git a/pandas/tests/indexes/timedeltas/test_delete.py b/pandas/tests/indexes/timedeltas/test_delete.py index 6e6f54702ce1a..f49af7cd0befd 100644 --- a/pandas/tests/indexes/timedeltas/test_delete.py +++ b/pandas/tests/indexes/timedeltas/test_delete.py @@ -44,7 +44,7 @@ def test_delete_slice(self): # reset freq to None expected_3_5 = TimedeltaIndex( - ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ["1 D", "2 D", "3 D", "7 D", "8 D", "9 D", "10D"], freq=None, name="idx" ) cases = { diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 397f9d9e18331..e411555c65bea 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -20,8 +20,10 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): - # GH#4226 - tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") + # GH#4226, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +232,7 @@ def test_take_invalid_kwargs(self): def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") + idx = timedelta_range(start="1D", end="2D", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) @@ -337,8 +339,10 @@ def test_contains_nonunique(self): def test_contains(self): # Checking for any NaT-like objects - # GH#13603 - td = to_timedelta(range(5), unit="d") + offsets.Hour(1) + # GH#13603, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
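+        # the lowercase "d" unit is retained deliberately: building the fixture
+        # under assert_produces_warning exercises the GH#59051 deprecation too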
+ with tm.assert_produces_warning(FutureWarning, match=msg): + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) for v in [NaT, None, float("nan"), np.nan]: assert v not in td diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 9f0552f8baa90..9a00c556dc515 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -103,30 +103,34 @@ def test_round(self): t1c = TimedeltaIndex(np.array([1, 1, 1], "m8[D]")).as_unit("ns") # note that negative times round DOWN! so don't give whole numbers - for freq, s1, s2 in [ - ("ns", t1, t2), - ("us", t1, t2), - ( - "ms", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + for freq, s1, s2 in [ + ("ns", t1, t2), + ("us", t1, t2), + ( + "ms", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ( - "s", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ( + "s", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("d", t1c, -1 * t1c), - ]: - r1 = t1.round(freq) + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("d", t1c, -1 * t1c), + ]: + r1 = t1.round(freq) + r2 = t2.round(freq) + tm.assert_index_equal(r1, s1) - r2 = t2.round(freq) tm.assert_index_equal(r2, s2) def test_components(self): diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index fce10d9176d74..ae88caf18fdae 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -42,7 +42,10 @@ def test_union_sort_false(self): tm.assert_index_equal(result, expected) def test_union_coverage(self): - idx = TimedeltaIndex(["3d", "1d", "2d"]) + # GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
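+        # constructing from lowercase "3d"/"1d"/"2d" strings now warns, so the
+        # original spelling is kept and the warning asserted instead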
+ with tm.assert_produces_warning(FutureWarning, match=msg): + idx = TimedeltaIndex(["3d", "1d", "2d"]) ordered = TimedeltaIndex(idx.sort_values(), freq="infer") result = ordered.union(idx) tm.assert_index_equal(result, ordered) @@ -70,7 +73,7 @@ def test_union_bug_1745(self): tm.assert_index_equal(result, exp) def test_union_bug_4564(self): - left = timedelta_range("1 day", "30d") + left = timedelta_range("1 day", "30D") right = left + pd.offsets.Minute(15) result = left.union(right) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 1b645e2bc607f..6f3d29fb4240a 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,6 @@ from pandas import ( Timedelta, - TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -70,14 +69,12 @@ def test_linspace_behavior(self, periods, freq): expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) - def test_timedelta_range_H_deprecated(self): + def test_timedelta_range_H_raises(self): # GH#52536 - msg = "'H' is deprecated and will be removed in a future version." + msg = "Invalid frequency: H" - result = timedelta_range(start="0 days", end="4 days", periods=6) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = timedelta_range(start="0 days", end="4 days", freq="19H12min") - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="4 days", freq="19H12min") def test_timedelta_range_T_raises(self): msg = "Invalid frequency: T" @@ -130,33 +127,6 @@ def test_timedelta_range_infer_freq(self): result = timedelta_range("0s", "1s", periods=31) assert result.freq is None - @pytest.mark.parametrize( - "freq_depr, start, end, expected_values, expected_freq", - [ - ( - "3.5S", - "05:03:01", - "05:03:10", - ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], - "3500ms", - ), - ], - ) - def test_timedelta_range_deprecated_freq( - self, freq_depr, start, end, expected_values, expected_freq - ): - # GH#52536 - msg = ( - f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." 
- ) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = timedelta_range(start=start, end=end, freq=freq_depr) - expected = TimedeltaIndex( - expected_values, dtype="timedelta64[ns]", freq=expected_freq - ) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "freq_depr, start, end", [ @@ -170,9 +140,15 @@ def test_timedelta_range_deprecated_freq( "5 hours", "5 hours 8 minutes", ), + ( + "3.5S", + "05:03:01", + "05:03:10", + ), ], ) def test_timedelta_range_removed_freq(self, freq_depr, start, end): + # GH#59143 msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): timedelta_range(start=start, end=end, freq=freq_depr) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 481a77fd03b05..7140ad7d1e9f5 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -232,3 +232,20 @@ def test_multiindex_from_tuples_with_nan(self): [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")] ) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("operation", ["div", "mul", "add", "sub"]) + def test_groupby_rename_categories_operation_with_multiindex(self, operation): + # GH#51500 + data = DataFrame( + [["C", "B", "B"], ["B", "A", "A"], ["B", "A", "B"]], columns=["0", "1", "2"] + ) + data["0"] = data["0"].astype("category") + data["0"] = data["0"].cat.rename_categories({"C": "B", "B": "C"}) + + a = data.groupby(by=["0", "1"])["2"].value_counts() + b = data.groupby(by=["0", "1"]).size() + + result = getattr(a, operation)(b) + expected = getattr(a, operation)(b.sort_index(ascending=False)) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index abf89c2b0d096..d732cb4d7fbbc 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -213,13 +213,11 @@ def test_multiindex_assignment_single_dtype(self): tm.assert_series_equal(result, exp) - # arr + 0.5 cannot be cast losslessly to int, so we upcast - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + # arr + 0.5 cannot be cast losslessly to int, so this now raises + with pytest.raises(TypeError, match="Invalid value"): df.loc[4, "c"] = arr + 0.5 - result = df.loc[4, "c"] - exp = exp + 0.5 - tm.assert_series_equal(result, exp) + # Upcast so that we can add .5 + df = df.astype({"c": "float64"}) + df.loc[4, "c"] = arr + 0.5 # scalar ok df.loc[4, "c"] = 10 diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 217ca74bd7fbd..10a8fa88b4b5e 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -24,12 +24,8 @@ def test_at_timezone(): # https://github.com/pandas-dev/pandas/issues/33544 result = DataFrame({"foo": [datetime(2000, 1, 1)]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc) - expected = DataFrame( - {"foo": [datetime(2000, 1, 2, tzinfo=timezone.utc)]}, dtype=object - ) - tm.assert_frame_equal(result, expected) def test_selection_methods_of_assigned_col(): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1b58f8e8b9831..c9f29b2cb55fe 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py
@@ -511,13 +511,13 @@ def test_loc_and_at_with_categorical_index(self): # pandas scalars [Interval(1, 4), Interval(4, 6), Interval(6, 9)], [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], - [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + [Timedelta(1, "D"), Timedelta(2, "D"), Timedelta(3, "D")], # pandas Integer arrays *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES), # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, - pd.timedelta_range(start="1d", periods=3).array, + pd.timedelta_range(start="1D", periods=3).array, ], ) def test_loc_getitem_with_non_string_categories(self, idx_values, ordered): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index efae0b4dd84cc..64d8068fa9291 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -285,11 +285,9 @@ def test_detect_chained_assignment_changing_dtype(self): df.loc[2]["C"] = "foo" tm.assert_frame_equal(df, df_original) # TODO: Use tm.raises_chained_assignment_error() when PDEP-6 is enforced - with tm.raises_chained_assignment_error( - extra_warnings=(FutureWarning,), extra_match=(None,) - ): - df["C"][2] = "foo" - tm.assert_frame_equal(df, df_original) + with pytest.raises(TypeError, match="Invalid value"): + with tm.raises_chained_assignment_error(): + df["C"][2] = "foo" def test_setting_with_copy_bug(self): # operating on a copy diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 8b90a6c32849d..417925f8ecb0d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -9,7 +9,6 @@ from pandas.errors import IndexingError from pandas import ( - NA, Categorical, CategoricalDtype, DataFrame, @@ -528,10 +527,9 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): assert len(df._mgr.blocks) == 1 # if the assigned values cannot be held by existing integer arrays, - # we cast - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + # we raise + with pytest.raises(TypeError, match="Invalid value"): df.iloc[:, 0] = df.iloc[:, 0] + 0.5 - assert len(df._mgr.blocks) == 2 expected = df.copy() @@ -1445,7 +1443,5 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") - expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 60a3ccf0b7483..61cbb1983e49a 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -180,14 +180,8 @@ def test_setitem_dtype_upcast(self): df["c"] = np.nan assert df["c"].dtype == np.float64 - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[0, "c"] = "foo" - expected = DataFrame( - {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} - ) - tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("val", [3.14, "wxyz"]) def test_setitem_dtype_upcast2(self, 
val): @@ -199,19 +193,8 @@ def test_setitem_dtype_upcast2(self, val): ) left = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = val - right = DataFrame( - [[0, val, 2], [3, 4, 5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_integer_dtype(left["foo"]) - assert is_integer_dtype(left["baz"]) def test_setitem_dtype_upcast3(self): left = DataFrame( @@ -219,21 +202,9 @@ def test_setitem_dtype_upcast3(self): index=list("ab"), columns=["foo", "bar", "baz"], ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = "wxyz" - right = DataFrame( - [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_float_dtype(left["foo"]) - assert is_float_dtype(left["baz"]) - def test_dups_fancy_indexing(self): # GH 3455 @@ -728,7 +699,7 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 16f3e0fd0c229..b8d012eca28ce 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,7 +16,6 @@ from pandas._config import using_pyarrow_string_dtype from pandas._libs import index as libindex -from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas as pd @@ -383,12 +382,8 @@ def test_loc_setitem_slice(self): df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") ix = df1["a"] == 1 newb2 = df2.loc[ix, "b"] - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df1.loc[ix, "b"] = newb2 - expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") - tm.assert_frame_equal(df2, expected) def test_loc_setitem_dtype(self): # GH31340 @@ -572,54 +567,31 @@ def frame_for_consistency(self): def test_loc_setitem_consistency(self, frame_for_consistency, val): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(0, index=range(5), dtype=np.int64), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = val - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series("foo", index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = "foo" - tm.assert_frame_equal(df, expected) def 
test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(1.0, index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = 1.0 - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = "string" - expected = DataFrame({"date": Series(["string"])}) - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_empty(self): # empty (essentially noops) @@ -677,16 +649,11 @@ def test_loc_setitem_consistency_slice_column_len(self): - # timedelta64[m] -> float, so this cannot be done inplace, so - # no warning - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + # timedelta64[m] -> float, so this cannot be done inplace and now raises + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, ("Respondent", "Duration")] = df.loc[ :, ("Respondent", "Duration") ] / Timedelta(60_000_000_000) - expected = Series( - [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") - ) - tm.assert_series_equal(df[("Respondent", "Duration")], expected) - @pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s", "ms", "us"]) def test_loc_assign_non_ns_datetime(self, unit): # GH 27395, non-ns dtype assignment via .loc should work @@ -1281,7 +1248,7 @@ def test_loc_getitem_time_object(self, frame_or_series): tm.assert_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -1296,13 +1263,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # regression test for GH#34526 itr_idx = range(2, rows) - result = df.loc[itr_idx].values + result = np.nan_to_num(df.loc[itr_idx].values) expected = spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for GH#34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype)) tm.assert_numpy_array_equal(result, expected) def test_loc_getitem_listlike_all_retains_sparse(self): @@ -1314,18 +1281,16 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64)) result = df.loc[range(2)] expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], - dtype=SparseDtype("float64", 0.0), + [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]], + dtype=SparseDtype(np.int64), ) tm.assert_frame_equal(result, expected) result = df.loc[range(2)].loc[range(1)] - expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0) - ) + expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64)) tm.assert_frame_equal(result, expected) def
test_loc_getitem_sparse_series(self): @@ -1413,13 +1378,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 @@ -1446,9 +1407,8 @@ def test_loc_setitem_datetime_coercion(self): df.loc[0:1, "c"] = np.datetime64("2008-08-08") assert Timestamp("2008-08-08") == df.loc[0, "c"] assert Timestamp("2008-08-08") == df.loc[1, "c"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "c"] = date(2005, 5, 5) - assert Timestamp("2005-05-05").date() == df.loc[2, "c"] @pytest.mark.parametrize("idxer", ["var", ["var"]]) def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): @@ -1459,12 +1419,13 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - with tm.assert_produces_warning( - FutureWarning if idxer == "var" else None, match="incompatible dtype" - ): + if idxer == "var": + with pytest.raises(TypeError, match="Invalid value"): + result.loc[:, idxer] = expected + else: # See https://github.com/pandas-dev/pandas/issues/56223 result.loc[:, idxer] = expected - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self): index = date_range("2012-01-01", "2012-01-05", freq="30min") @@ -1610,16 +1571,8 @@ def test_loc_setitem_cast2(self): # dtype conversion on setting df = DataFrame(np.random.default_rng(2).random((30, 3)), columns=tuple("ABC")) df["event"] = np.nan - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[10, "event"] = "foo" - result = df.dtypes - expected = Series( - [np.dtype("float64")] * 3 + [np.dtype("object")], - index=["A", "B", "C", "event"], - ) - tm.assert_series_equal(result, expected) def test_loc_setitem_cast3(self): # Test that data type is preserved . 
GH#5782 @@ -2974,20 +2927,9 @@ def test_loc_setitem_uint8_upcast(value): # GH#26049 df = DataFrame([1, 2, 3, 4], columns=["col1"], dtype="uint8") - with tm.assert_produces_warning(FutureWarning, match="item of incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "col1"] = value # value that can't be held in uint8 - if np_version_gt2 and isinstance(value, np.int16): - # Note, result type of uint8 + int16 is int16 - # in numpy < 2, though, numpy would inspect the - # value and see that it could fit in an uint16, resulting in a uint16 - dtype = "int16" - else: - dtype = "uint16" - - expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype=dtype) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "fill_val,exp_dtype", diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fca1ed39c0f9c..579d3fbfb3435 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1280,20 +1280,19 @@ def test_interval_can_hold_element(self, dtype, element): # `elem` to not have the same length as `arr` ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") elem = element(ii2) - msg = "Setting an item of incompatible dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) elem = element(ii3) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) elem = element(ii4) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) @@ -1313,13 +1312,12 @@ def test_period_can_hold_element(self, element): # `elem` to not have the same length as `arr` pi2 = pi.asfreq("D")[:-1] elem = element(pi2) - msg = "Setting an item of incompatible dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) dti = pi.to_timestamp("s")[:-1] elem = element(dti) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) def check_can_hold_element(self, obj, elem, inplace: bool): diff --git a/pandas/tests/io/data/excel/test_boolean_types.xlsx b/pandas/tests/io/data/excel/test_boolean_types.xlsx new file mode 100644 index 0000000000000..234703c32f0ab Binary files /dev/null and b/pandas/tests/io/data/excel/test_boolean_types.xlsx differ diff --git a/pandas/tests/io/data/excel/test_none_type.xlsx b/pandas/tests/io/data/excel/test_none_type.xlsx new file mode 100644 index 0000000000000..38aaf72ddfc8f Binary files /dev/null and b/pandas/tests/io/data/excel/test_none_type.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 6d6c3ad6b77a7..5ce78b1c90e76 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -164,6 +164,36 @@ def xfail_datetimes_with_pyxlsb(engine, request): class TestReaders: + 
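# GH 58159 coverage below: nullable boolean values should survive a write/read round-trip via the openpyxl engine +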
@pytest.mark.parametrize("col", [[True, None, False], [True], [True, False]]) + def test_read_excel_type_check(self, col, datapath): + # GH 58159 + df = DataFrame({"bool_column": col}, dtype="boolean") + f_path = datapath("io", "data", "excel", "test_boolean_types.xlsx") + + df.to_excel(f_path, index=False) + df2 = pd.read_excel(f_path, dtype={"bool_column": "boolean"}, engine="openpyxl") + tm.assert_frame_equal(df, df2) + + def test_pass_none_type(self, datapath): + # GH 58159 + f_path = datapath("io", "data", "excel", "test_none_type.xlsx") + + with pd.ExcelFile(f_path) as excel: + parsed = pd.read_excel( + excel, + sheet_name="Sheet1", + keep_default_na=True, + na_values=["nan", "None", "abcd"], + dtype="boolean", + engine="openpyxl", + ) + expected = DataFrame( + {"Test": [True, None, False, None, False, None, True]}, + dtype="boolean", + ) + + tm.assert_frame_equal(parsed, expected) + @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): """ diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 744fe20e4995d..482b331332462 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -12,7 +12,6 @@ import numpy as np import pytest -from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -50,7 +49,7 @@ def frame(float_frame): return float_frame[:10] -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True, False, "columns"]) def merge_cells(request): return request.param @@ -1251,13 +1250,12 @@ def test_engine_kwargs(self, engine, tmp_excel): "xlsxwriter": r"__init__() got an unexpected keyword argument 'foo'", } - if PY310: - msgs["openpyxl"] = ( - "Workbook.__init__() got an unexpected keyword argument 'foo'" - ) - msgs["xlsxwriter"] = ( - "Workbook.__init__() got an unexpected keyword argument 'foo'" - ) + msgs["openpyxl"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) + msgs["xlsxwriter"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) # Handle change in error message for openpyxl (write and append mode) if engine == "openpyxl" and not os.path.exists(tmp_excel): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 49776d532db1d..7bf041a50b745 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -221,7 +221,7 @@ def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) - df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="D")}) expected_rows = [ ",A", diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index fffb1b9b9d2a4..7aa7cebb5120f 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -11,7 +11,7 @@ def test_keyword_deprecation(): # GH 57280 msg = ( - "Starting with pandas version 3.0.0 all arguments of to_markdown " + "Starting with pandas version 4.0 all arguments of to_markdown " "except for the argument 'buf' will be keyword-only." 
) s = pd.Series() diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 7c7069aa74eeb..ed871577d677f 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -38,7 +38,7 @@ class TestDataFrameToStringFormatters: def test_keyword_deprecation(self): # GH 57280 msg = ( - "Starting with pandas version 3.0.0 all arguments of to_string " + "Starting with pandas version 4.0 all arguments of to_string " "except for the argument 'buf' will be keyword-only." ) s = Series(["a", "b"]) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index a0d5b3a741aaf..e61a8ee722443 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -32,7 +32,7 @@ def df_schema(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), @@ -45,12 +45,12 @@ def df_table(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), }, index=pd.Index(range(4), name="idx"), ) @@ -687,7 +687,7 @@ class TestTableOrientReader: {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -699,7 +699,7 @@ class TestTableOrientReader: {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], @@ -738,7 +738,7 @@ def test_read_json_table_orient_raises(self, index_nm): {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -750,7 +750,7 @@ def test_read_json_table_orient_raises(self, index_nm): {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], @@ -772,15 +772,15 @@ def test_read_json_table_period_orient(self, index_nm, vals): pd.Index(range(4)), pd.date_range( "2020-08-30", - freq="d", + freq="D", periods=4, )._with_freq(None), pd.date_range( - "2020-08-30", freq="d", periods=4, tz="US/Central" + "2020-08-30", freq="D", periods=4, tz="US/Central" )._with_freq(None), pd.MultiIndex.from_product( [ - pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"), + pd.date_range("2020-08-30", freq="D", periods=2, tz="US/Central"), ["x", "y"], ], ), @@ 
-790,10 +790,10 @@ def test_read_json_table_period_orient(self, index_nm, vals): "vals", [ {"floats": [1.1, 2.2, 3.3, 4.4]}, - {"dates": pd.date_range("2020-08-30", freq="d", periods=4)}, + {"dates": pd.date_range("2020-08-30", freq="D", periods=4)}, { "timezones": pd.date_range( - "2020-08-30", freq="d", periods=4, tz="Europe/London" + "2020-08-30", freq="D", periods=4, tz="Europe/London" ) }, ], @@ -810,12 +810,12 @@ def test_comprehensive(self): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), # 'D': pd.timedelta_range('1h', periods=4, freq='min'), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.1, 2.2, 3.3, 4.4], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), "I": [True, False, False, True], }, index=pd.Index(range(4), name="idx"), diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b53957a7e77d1..a34c0adc69821 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -792,7 +792,7 @@ def test_frame_from_json_precise_float(self): def test_typ(self): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") - result = read_json(StringIO(s.to_json()), typ=None) + result = read_json(StringIO(s.to_json()), typ="series") tm.assert_series_equal(result, s) def test_reconstruction_index(self): @@ -1610,6 +1610,13 @@ def test_to_json_from_json_columns_dtypes(self, orient): ) tm.assert_frame_equal(result, expected) + def test_to_json_with_index_as_a_column_name(self): + df = DataFrame(data={"index": [1, 2], "a": [2, 3]}) + with pytest.raises( + ValueError, match="Overlapping names between the index and columns" + ): + df.to_json(orient="table") + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index d96ccb4b94cc2..3c843479b446a 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine): s = pd.Series({"A": 1, "B": 2}) strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = read_json(strio, lines=True, typ="Series", engine=engine) + unchunked = read_json(strio, lines=True, typ="series", engine=engine) strio = StringIO(s.to_json(lines=True, orient="records")) with read_json( - strio, lines=True, typ="Series", chunksize=1, engine=engine + strio, lines=True, typ="series", chunksize=1, engine=engine ) as reader: chunked = pd.concat(reader) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 8e05a8e6fc5d8..62118f1c82ebb 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -10,7 +10,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas._libs.json as ujson from pandas.compat import IS64 @@ -370,6 +369,7 @@ def test_encode_time_conversion_basic(self, test): def test_encode_time_conversion_pytz(self): # see gh-11473: to_json segfaults with timezone-aware datetimes + pytz = pytest.importorskip("pytz") test = datetime.time(10, 12, 15, 343243, pytz.utc) output = ujson.ujson_dumps(test) expected = f'"{test.isoformat()}"' diff --git 
a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 091edb67f6e19..668aab05b9fa4 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -98,6 +98,31 @@ def test_iterator_stop_on_chunksize(all_parsers): tm.assert_frame_equal(concat(result), expected) +def test_nrows_iterator_without_chunksize(all_parsers): + # GH 59079 + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, nrows=2) + return + + with parser.read_csv(StringIO(data), iterator=True, nrows=2) as reader: + result = reader.get_chunk() + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["foo", "bar"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a72e66996d43..803114723bc74 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -26,7 +26,7 @@ def custom_dialect(): "escapechar": "~", "delimiter": ":", "skipinitialspace": False, - "quotechar": "~", + "quotechar": "`", "quoting": 3, } return dialect_name, dialect_kwargs diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index f63cc3d56bf89..4ccfa8e81e883 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -75,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index e9c6c0f5e32d7..ec7e5575b2e7d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -12,7 +12,6 @@ import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -217,6 +216,7 @@ def test_parse_tz_aware(all_parsers): {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) if parser.engine == "pyarrow": + pytz = pytest.importorskip("pytz") expected_tz = pytz.utc else: expected_tz = timezone.utc diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e33ddaf3b81f0..ba108370a4a92 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -317,3 +317,14 @@ def test_read_infer_string(tmp_path, setup_path): columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + + +def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): + # GH 59004 + df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") + path = tmp_path / setup_path + with HDFStore(path, mode="w") as store: + store.put("df_s", df_s) + with HDFStore(path, mode="r") as store: + df_fromstore = store.get("df_s") + tm.assert_frame_equal(df_s, df_fromstore) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 51ee289c8e27a..3ad05cec3bca3 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -236,8 +236,10 
@@ def test_table_values_dtypes_roundtrip(setup_path): df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") + df1["time_s_1"] = Timestamp("20130101") + df1["time_s_2"] = Timestamp("20130101 00:00:00") + df1["time_ms"] = Timestamp("20130101 00:00:00.000") + df1["time_ns"] = Timestamp("20130102 00:00:00.000000000") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() @@ -252,7 +254,9 @@ def test_table_values_dtypes_roundtrip(setup_path): "int8": 1, "int64": 1, "object": 1, - "datetime64[ns]": 2, + "datetime64[s]": 2, + "datetime64[ms]": 1, + "datetime64[ns]": 1, }, name="count", ) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index fc5df6d9babcb..62f234ec2db4a 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -30,9 +30,9 @@ def data_test_ix(request, dirpath): fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv") df = pd.read_csv(fname) epoch = datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit="d") + t1 = pd.to_timedelta(df["Column4"], unit="D") df["Column4"] = (epoch + t1).astype("M8[s]") - t2 = pd.to_timedelta(df["Column12"], unit="d") + t2 = pd.to_timedelta(df["Column12"], unit="D") df["Column12"] = (epoch + t2).astype("M8[s]") for k in range(df.shape[1]): col = df.iloc[:, k] diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e4b4d3a82669d..26bb2be73838a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -474,7 +474,10 @@ def test_warning_missing_utf_bom(self, encoding, compression_): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) - msg = r"UTF-\d+ stream does not start with BOM" + msg = ( + r"UTF-\d+ stream does not start with BOM|" + r"'utf-\d+' codec can't decode byte" + ) with pytest.raises(UnicodeError, match=msg): pd.read_csv(path, compression=compression_, encoding=encoding) @@ -552,7 +555,7 @@ def test_explicit_encoding(io_class, mode, msg): expected.to_csv(buffer, mode=f"w{mode}") -@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) +@pytest.mark.parametrize("encoding_errors", ["strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_encoding_errors(encoding_errors, format): # GH39450 @@ -587,6 +590,17 @@ def test_encoding_errors(encoding_errors, format): tm.assert_frame_equal(df, expected) +@pytest.mark.parametrize("encoding_errors", [0, None]) +def test_encoding_errors_badtype(encoding_errors): + # GH 59075 + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(ValueError, match=expected_error): + reader(content) + + def test_bad_encdoing_errors(): # GH 39777 with tm.ensure_clean() as path: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 00082be7e07e8..efc3e71564260 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -231,7 +231,7 @@ def test_with_missing_lzma(): @pytest.mark.single_cpu def test_with_missing_lzma_runtime(): - """Tests if RuntimeError is hit when calling lzma without + """Tests if ModuleNotFoundError is hit when calling lzma 
without having the module available. """ code = textwrap.dedent( @@ -241,7 +241,7 @@ def test_with_missing_lzma_runtime(): sys.modules['lzma'] = None import pandas as pd df = pd.DataFrame() - with pytest.raises(RuntimeError, match='lzma module'): + with pytest.raises(ModuleNotFoundError, match='import of lzma'): df.to_csv('foo.csv', compression='xz') """ ) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 893728748f276..dc82994bcbc7f 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,5 +1,7 @@ """test feather-format compat""" +import zoneinfo + import numpy as np import pytest @@ -62,6 +64,7 @@ def test_error(self): self.check_error_on_write(obj, ValueError, msg) def test_basic(self): + tz = zoneinfo.ZoneInfo("US/Eastern") df = pd.DataFrame( { "string": list("abc"), @@ -76,7 +79,7 @@ def test_basic(self): list(pd.date_range("20130101", periods=3)), freq=None ), "dttz": pd.DatetimeIndex( - list(pd.date_range("20130101", periods=3, tz="US/Eastern")), + list(pd.date_range("20130101", periods=3, tz=tz)), freq=None, ), "dt_with_null": [ @@ -93,7 +96,7 @@ def test_basic(self): df["timedeltas"] = pd.timedelta_range("1 day", periods=3) df["intervals"] = pd.interval_range(0, 3, 3) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" expected = df.copy() expected.loc[1, "bool_with_null"] = None diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index af492b967bc1d..930df8abea30f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -14,6 +14,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, + pa_version_under17p0, ) import pandas as pd @@ -1033,7 +1034,9 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - @pytest.mark.xfail(reason="pa.pandas_compat passes 'datetime64' to .astype") + @pytest.mark.xfail( + pa_version_under17p0, reason="pa.pandas_compat passes 'datetime64' to .astype" + ) def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1129,9 +1132,11 @@ def test_infer_string_large_string_type(self, tmp_path, pa): class TestParquetFastParquet(Base): @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") df = df_full - dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + dti = pd.date_range("20130101", periods=3, tz=tz) dti = dti._with_freq(None) # freq doesn't round-trip df["datetime_tz"] = dti df["timedelta"] = pd.timedelta_range("1 day", periods=3) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 1420e24858ffb..5fe0f1265edff 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,7 +13,6 @@ from __future__ import annotations -from array import array import bz2 import datetime import functools @@ -32,12 +31,8 @@ import numpy as np import pytest -from pandas.compat import ( - get_lzma_file, - is_platform_little_endian, -) +from pandas.compat import is_platform_little_endian from pandas.compat._optional import import_optional_dependency -from pandas.compat.compressors import flatten_buffer import pandas as pd from pandas import ( @@ -81,36 +76,8 @@ def compare_element(result, expected, typ): # --------------------- -@pytest.mark.parametrize( - "data", - [ - b"123", - b"123456", - bytearray(b"123"), - 
memoryview(b"123"), - pickle.PickleBuffer(b"123"), - array("I", [1, 2, 3]), - memoryview(b"123456").cast("B", (3, 2)), - memoryview(b"123456").cast("B", (3, 2))[::2], - np.arange(12).reshape((3, 4), order="C"), - np.arange(12).reshape((3, 4), order="F"), - np.arange(12).reshape((3, 4), order="C")[:, ::2], - ], -) -def test_flatten_buffer(data): - result = flatten_buffer(data) - expected = memoryview(data).tobytes("A") - assert result == expected - if isinstance(data, (bytes, bytearray)): - assert result is data - elif isinstance(result, memoryview): - assert result.ndim == 1 - assert result.format == "B" - assert result.contiguous - assert result.shape == (result.nbytes,) - - def test_pickles(datapath): + pytest.importorskip("pytz") if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") @@ -261,7 +228,9 @@ def compress_file(self, src_path, dest_path, compression): tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path)) tar.addfile(tarinfo, fh) elif compression == "xz": - f = get_lzma_file()(dest_path, "w") + import lzma + + f = lzma.LZMAFile(dest_path, "w") elif compression == "zstd": f = import_optional_dependency("zstandard").open(dest_path, "wb") else: diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 357e6129dd8f1..4454607606395 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1038,7 +1038,7 @@ def test_utf16_encoding(xml_baby_names, parser): UnicodeError, match=( "UTF-16 stream does not start with BOM|" - "'utf-16-le' codec can't decode byte" + "'utf-16(-le)?' codec can't decode byte" ), ): read_xml(xml_baby_names, encoding="UTF-16", parser=parser) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index e809bd33610f1..b381c4fce8430 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1120,7 +1120,7 @@ def test_boxplot_return_type_invalid_type(self, return_type): def test_kde_df(self): pytest.importorskip("scipy") - df = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) ax = _check_plot_works(df.plot, kind="kde") expected = [pprint_thing(c) for c in df.columns] _check_legend_labels(ax, labels=expected) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index a9135ee583d91..1275f3d6f7d6d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1543,7 +1543,7 @@ def test_format_timedelta_ticks_wide(self): "9 days 06:13:20", ] - rng = timedelta_range("0", periods=10, freq="1 d") + rng = timedelta_range("0", periods=10, freq="1 D") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(fontsize=2, ax=ax) @@ -1562,7 +1562,7 @@ def test_timedelta_plot(self): def test_timedelta_long_period(self): # test long period - index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 D") s = Series(np.random.default_rng(2).standard_normal(len(index)), index) _, ax = mpl.pyplot.subplots() _check_plot_works(s.plot, ax=ax) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index f4ea6b1d3f3de..b2d9f6c0e3eb0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -436,7 +436,7 @@ def 
test_resample_empty_dtypes(index, dtype, resample_method): empty_series_dti = Series([], index, dtype) with tm.assert_produces_warning(warn, match=msg): - rs = empty_series_dti.resample("d", group_keys=False) + rs = empty_series_dti.resample("D", group_keys=False) try: getattr(rs, resample_method)() except DataError: diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 7f37ca6831faa..dc2ddcc70828f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,9 +1,9 @@ from datetime import datetime from functools import partial +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs import lib from pandas._typing import DatetimeNaTType @@ -239,7 +239,9 @@ def _ohlc(group): def test_resample_how_callables(unit): # GH#7929 data = np.arange(5, dtype=np.int64) - ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) df = DataFrame({"A": data, "B": data}, index=ind) def fn(x, a=1): @@ -334,7 +336,9 @@ def test_resample_basic_from_daily(unit): s = Series(np.random.default_rng(2).random(len(dti)), dti) # to weekly - result = s.resample("w-sun").last() + msg = "'w-sun' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample("w-sun").last() assert len(result) == 3 assert (result.index.dayofweek == [6, 6, 6]).all() @@ -1190,7 +1194,9 @@ def test_anchored_lowercase_buglet(unit): dates = date_range("4/16/2012 20:00", periods=50000, freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! - ts.resample("d").mean() + msg = "'d' is deprecated and will be removed in a future version." 
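+ # "d" still resolves to daily frequency here, but only with a FutureWarning; "D" is the canonical alias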
+ with tm.assert_produces_warning(FutureWarning, match=msg): + ts.resample("d").mean() def test_upsample_apply_functions(unit): @@ -1531,9 +1537,9 @@ def test_groupby_with_dst_time_change(unit): ) df = DataFrame([1, 2], index=index) - result = df.groupby(Grouper(freq="1d")).last() + result = df.groupby(Grouper(freq="1D")).last() expected_index_values = date_range( - "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + "2016-11-02", "2016-11-24", freq="D", tz="America/Chicago" ).as_unit(unit) index = DatetimeIndex(expected_index_values) @@ -1655,13 +1661,13 @@ def test_resample_dst_anchor2(unit): def test_downsample_across_dst(unit): # GH 8531 - tz = pytz.timezone("Europe/Berlin") + tz = zoneinfo.ZoneInfo("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + dates = date_range(dt.astimezone(tz), periods=4, freq="2h").as_unit(unit) result = Series(5, index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), + index=date_range(dt.astimezone(tz), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ -2018,7 +2024,7 @@ def test_resample_empty_series_with_tz(): def test_resample_M_Q_Y_raises(freq): msg = f"Invalid frequency: {freq}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() @@ -2027,11 +2033,32 @@ def test_resample_M_Q_Y_raises(freq): def test_resample_BM_BQ_raises(freq): msg = f"Invalid frequency: {freq}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() +@pytest.mark.parametrize( + "freq,freq_depr,data", + [ + ("1W-SUN", "1w-sun", ["2013-01-06"]), + ("1D", "1d", ["2013-01-01"]), + ("1B", "1b", ["2013-01-01"]), + ("1C", "1c", ["2013-01-01"]), + ], +) +def test_resample_depr_lowercase_frequency(freq, freq_depr, data): + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
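+ # e.g. "1w-sun" warns and resolves to "1W-SUN"; likewise "1d" -> "1D", "1b" -> "1B", "1c" -> "1C"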
+ + s = Series(range(5), index=date_range("20130101", freq="h", periods=5)) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample(freq_depr).mean() + + exp_dti = DatetimeIndex(data=data, dtype="datetime64[ns]", freq=freq) + expected = Series(2.0, index=exp_dti) + tm.assert_series_equal(result, expected) + + def test_resample_ms_closed_right(unit): # https://github.com/pandas-dev/pandas/issues/55271 dti = date_range(start="2020-01-31", freq="1min", periods=6000, unit=unit) @@ -2129,6 +2156,6 @@ def test_arrow_timestamp_resample(tz): def test_resample_A_raises(freq): msg = f"Invalid frequency: {freq[1:]}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index a4e27ad46c59c..e17529dfab00c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,11 +1,14 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import re import warnings +import zoneinfo import dateutil import numpy as np import pytest -import pytz from pandas._libs.tslibs.ccalendar import ( DAYS, @@ -161,12 +164,12 @@ def test_basic_downsample(self, simple_period_range_series): ("Y-DEC", ""), ("Q-MAR", ""), ("M", ""), - ("w-thu", ""), + ("W-THU", ""), ], ) def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): # These are incompatible period rules for resampling - ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="W-WED") msg = ( "Frequency cannot be resampled to " f"{expected_error_msg}, as they are not sub or super periods" @@ -304,7 +307,7 @@ def test_resample_incompat_freq(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -312,9 +315,13 @@ def test_with_local_timezone(self, tz): # see gh-5430 local_timezone = tz - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=timezone.utc + ) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=timezone.utc + ) index = date_range(start, end, freq="h", name="idx") @@ -336,7 +343,7 @@ def test_with_local_timezone(self, tz): @pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -353,8 +360,6 @@ def test_resample_with_tz(self, tz, unit): index=exp_dti, ) tm.assert_series_equal(result, expected) - # Especially assert that the timezone is LMT for pytz - assert result.index.tz == tz def test_resample_nonexistent_time_bin_edge(self): # GH 19375 diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index bf1f6bd34b171..a8fb1b392322d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -732,7 +732,7 @@ def test_agg_with_datetime_index_list_agg_func(col_name): ), columns=[col_name], ) - result = df.resample("1d").aggregate(["mean"]) + result = 
df.resample("1D").aggregate(["mean"]) expected = DataFrame( [47.5, 143.5, 195.5], index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 2646106b9b97c..f694b90a707c7 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -193,7 +193,7 @@ def test_aggregate_nth(): ) def test_resample_entirely_nat_window(method, method_args, unit): ser = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) - result = methodcaller(method, **method_args)(ser.resample("2d")) + result = methodcaller(method, **method_args)(ser.resample("2D")) exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D") expected = Series([0.0, unit], index=exp_dti) @@ -372,7 +372,7 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): for df in dfs: result = df.groupby("volume").apply( - lambda x: x.resample("1d").interpolate(method="linear"), + lambda x: x.resample("1D").interpolate(method="linear"), include_groups=False, ) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index afafe8f6ab264..d0ff950e7985f 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -1,3 +1,5 @@ +import zoneinfo + import numpy as np import pytest @@ -353,14 +355,15 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) # different tz - dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + tz_diff = zoneinfo.ZoneInfo("US/Hawaii") + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz_diff) exp = Index( [ pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01", tz="US/Pacific"), - pd.Timestamp("2012-01-02", tz="US/Pacific"), + pd.Timestamp("2012-01-01", tz=tz_diff), + pd.Timestamp("2012-01-02", tz=tz_diff), ], dtype=object, ) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0a5989e3c82e6..0ab4d08db7cc9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1367,8 +1367,8 @@ def test_merge_two_empty_df_no_division_error(self): ), ), ( - TimedeltaIndex(["1d", "2d", "3d"]), - TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + TimedeltaIndex(["1D", "2D", "3D"]), + TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]), ), ], ) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 4fc57c14ec4c3..bd364de26a3c4 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -2,7 +2,6 @@ import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -2071,7 +2070,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), @@ -2083,7 +2082,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-01"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value2": list("ABCDE"), @@ -2097,7 +2096,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + 
tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 728becc76b71f..2872b1e29d629 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2769,3 +2769,17 @@ def test_unstack_copy(self, m): result = df.unstack(sort=False) result.iloc[0, 0] = -1 tm.assert_frame_equal(df, df_orig) + + def test_pivot_empty_with_datetime(self): + # GH#59126 + df = DataFrame( + { + "timestamp": Series([], dtype=pd.DatetimeTZDtype(tz="UTC")), + "category": Series([], dtype=str), + "value": Series([], dtype=str), + } + ) + df_pivoted = df.pivot_table( + index="category", columns="value", values="timestamp" + ) + assert df_pivoted.empty diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 90d4a7d0cc23b..0ae5389a3e9b5 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -111,8 +111,7 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - msg_depr = "'H' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg_depr): + with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start @@ -120,8 +119,6 @@ def test_conv_annual(self): with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("T", "s") == ival_A_to_T_start assert ival_A.asfreq("T", "E") == ival_A_to_T_end - msg_depr = "'S' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg_depr): assert ival_A.asfreq("S", "S") == ival_A_to_S_start assert ival_A.asfreq("S", "E") == ival_A_to_S_end diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 49bd48b40e67a..fe51817a78be8 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -117,7 +117,9 @@ def test_construction(self): i2 = Period("3/1/2005", freq="D") assert i1 == i2 - i3 = Period(year=2005, month=3, day=1, freq="d") + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i3 = Period(year=2005, month=3, day=1, freq="d") assert i1 == i3 i1 = Period("2007-01-01 09:00:00.001") @@ -613,6 +615,25 @@ def test_period_large_ordinal(self, hour): p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour + @pytest.mark.filterwarnings( + "ignore:Period with BDay freq is deprecated:FutureWarning" + ) + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_deprecated_lowercase_freq(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Period("2016-03-01 09:00", freq=freq_depr) + + expected = Period("2016-03-01 09:00", freq=freq) + assert result == expected + class TestPeriodMethods: def test_round_trip(self): diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 131be7a77f2e5..b20df43dd49a6 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -3,10 +3,10 @@ timedelta, ) import operator +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import iNaT from pandas.compat.numpy import np_version_gte1p24p3 @@ -361,7 +361,7 @@ def test_nat_doc_strings(compare): (Timestamp("2014-01-01"), "timestamp"), (Timestamp("2014-01-01", tz="UTC"), "timestamp"), (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), - (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + (datetime(2014, 1, 1).astimezone(zoneinfo.ZoneInfo("Asia/Tokyo")), "timestamp"), ], ) def test_nat_arithmetic_scalar(op_name, value, val_type): diff --git a/pandas/tests/scalar/timedelta/methods/test_round.py b/pandas/tests/scalar/timedelta/methods/test_round.py index 082c36999e06f..96cb1c07d2b76 100644 --- a/pandas/tests/scalar/timedelta/methods/test_round.py +++ b/pandas/tests/scalar/timedelta/methods/test_round.py @@ -38,7 +38,7 @@ class TestTimedeltaRound: ("min", "1 days 02:35:00", "-1 days 02:35:00"), ("12min", "1 days 02:36:00", "-1 days 02:36:00"), ("h", "1 days 03:00:00", "-1 days 03:00:00"), - ("d", "1 days", "-1 days"), + ("D", "1 days", "-1 days"), ], ) def test_round(self, freq, s1, s2): diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index efeca375affbb..2183a5851ea9c 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -79,7 +79,7 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -111,7 +111,7 @@ def test_td_add_timestamp_overflow(self): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) @@ -119,35 +119,35 @@ def test_td_add_td(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert 
result == expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() @@ -159,7 +159,7 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() @@ -172,12 +172,12 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") td_nat = np.timedelta64("NaT") result = td - td_nat @@ -187,13 +187,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - offsets.Hour(1) assert isinstance(result, Timedelta) assert result == Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "unsupported operand type" for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError, match=msg): @@ -234,7 +234,7 @@ def test_td_add_sub_int_ndarray(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT - td assert result is NaT @@ -242,7 +242,7 @@ def test_td_rsub_nat(self): assert result is NaT def test_td_rsub_offset(self): - result = offsets.Hour(1) - Timedelta(10, unit="d") + result = offsets.Hour(1) - Timedelta(10, unit="D") assert isinstance(result, Timedelta) assert result == Timedelta(-239, unit="h") @@ -362,7 +362,7 @@ class TestTimedeltaMultiplicationDivision: @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") typs = "|".join(["numpy.timedelta64", "NaTType", "Timedelta"]) msg = "|".join( [ @@ -377,7 +377,7 @@ def test_td_mul_nat(self, op, td_nat): @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, nan) assert result is NaT @@ -419,7 +419,7 @@ def test_td_mul_numeric_ndarray(self): def test_td_mul_numeric_ndarray_0d(self): td = Timedelta("1 day") - other = np.array(2) + other = np.array(2, dtype=np.int64) assert other.ndim == 0 expected = Timedelta("2 days") @@ -449,7 +449,7 @@ def test_td_mul_td64_ndarray_invalid(self): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / offsets.Hour(1) assert result == 240 @@ -480,7 +480,7 @@ def test_td_div_td64_non_nano(self): def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / 2 assert isinstance(result, Timedelta) @@ -500,7 +500,7 @@ def test_td_div_numeric_scalar(self): ) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / nan assert result is NaT @@ -532,7 +532,7 @@ def test_td_div_ndarray_0d(self): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, 
unit="D") result = offsets.Hour(1) / td assert result == 1 / 240.0 @@ -540,7 +540,7 @@ def test_td_rdiv_timedeltalike_scalar(self): def test_td_rdiv_na_scalar(self): # GH#31869 None gets cast to NaT - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT / td assert np.isnan(result) @@ -560,7 +560,7 @@ def test_td_rdiv_na_scalar(self): np.nan / td def test_td_rdiv_ndarray(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array([td], dtype=object) result = arr / td @@ -583,7 +583,7 @@ def test_td_rdiv_ndarray(self): arr / td def test_td_rdiv_ndarray_0d(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array(td.asm8) @@ -623,6 +623,7 @@ def test_td_floordiv_invalid_scalar(self): [ r"Invalid dtype datetime64\[D\] for __floordiv__", "'dtype' is an invalid keyword argument for this function", + "this function got an unexpected keyword argument 'dtype'", r"ufunc '?floor_divide'? cannot use operands with types", ] ) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 5509216f4daf4..e029dfc3b2703 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -32,20 +32,31 @@ def test_unit_m_y_raises(self, unit): with pytest.raises(ValueError, match=msg): to_timedelta([1, 2], unit) - @pytest.mark.parametrize("unit", ["h", "s"]) - def test_units_H_S_deprecated(self, unit): + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): # GH#52536 - msg = f"'{unit.upper()}' is deprecated and will be removed in a future version." + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
expected = Timedelta(1, unit=unit) with tm.assert_produces_warning(FutureWarning, match=msg): - result = Timedelta(1, unit=unit.upper()) + result = Timedelta(1, unit=unit_depr) tm.assert_equal(result, expected) @pytest.mark.parametrize( "unit, np_unit", - [(value, "W") for value in ["W", "w"]] - + [(value, "D") for value in ["D", "d", "days", "day", "Days", "Day"]] + [("W", "W")] + + [(value, "D") for value in ["D", "days", "day", "Days", "Day"]] + [ (value, "m") for value in [ @@ -78,7 +89,6 @@ def test_units_H_S_deprecated(self, unit): "millisecond", "milli", "millis", - "MS", "Milliseconds", "Millisecond", "Milli", @@ -93,7 +103,6 @@ def test_units_H_S_deprecated(self, unit): "microsecond", "micro", "micros", - "US", "Microseconds", "Microsecond", "Micro", @@ -108,7 +117,6 @@ def test_units_H_S_deprecated(self, unit): "nanosecond", "nano", "nanos", - "NS", "Nanoseconds", "Nanosecond", "Nano", @@ -250,8 +258,8 @@ def test_from_tick_reso(): def test_construction(): expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") - assert Timedelta(10, unit="d")._value == expected - assert Timedelta(10.0, unit="d")._value == expected + assert Timedelta(10, unit="D")._value == expected + assert Timedelta(10.0, unit="D")._value == expected assert Timedelta("10 days")._value == expected assert Timedelta(days=10)._value == expected assert Timedelta(days=10.0)._value == expected diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index e1b0076d5b7b9..1aafeec2ceed5 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize( "td, expected_repr", [ - (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="D"), "Timedelta('10 days 00:00:00')"), (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 01e7ba52e58aa..8be2ec846a6d9 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -280,7 +280,7 @@ def test_timedelta_class_min_max_resolution(): class TestTimedeltaUnaryOps: def test_invert(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "bad operand type for unary ~" with pytest.raises(TypeError, match=msg): @@ -295,17 +295,17 @@ def test_invert(self): ~(td.to_timedelta64()) def test_unary_ops(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") # __neg__, __pos__ - assert -td == Timedelta(-10, unit="d") - assert -td == Timedelta("-10d") - assert +td == Timedelta(10, unit="d") + assert -td == Timedelta(-10, unit="D") + assert -td == Timedelta("-10D") + assert +td == Timedelta(10, unit="D") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta("10d") + assert abs(-td) == Timedelta("10D") class TestTimedeltas: @@ -334,7 +334,7 @@ def test_total_seconds_scalar(self): assert np.isnan(rng.total_seconds()) def test_conversion(self): - for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt @@ -450,7 +450,7 @@ def test_numeric_conversions(self): assert Timedelta(10, 
unit="us") == np.timedelta64(10, "us") assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") assert Timedelta(10, unit="s") == np.timedelta64(10, "s") - assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + assert Timedelta(10, unit="D") == np.timedelta64(10, "D") def test_timedelta_conversions(self): assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( @@ -474,7 +474,7 @@ def test_to_numpy_alias(self): td.to_numpy(copy=True) def test_identity(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) @@ -489,7 +489,10 @@ def conv(v): assert Timedelta("1000") == np.timedelta64(1000, "ns") assert Timedelta("1000ns") == np.timedelta64(1000, "ns") - assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + msg = "'NS' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") assert Timedelta("10us") == np.timedelta64(10000, "ns") assert Timedelta("100us") == np.timedelta64(100000, "ns") @@ -508,8 +511,10 @@ def conv(v): assert Timedelta("100s") == np.timedelta64(100000000000, "ns") assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") - assert Timedelta("1d") == conv(np.timedelta64(1, "D")) - assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1D") == -conv(np.timedelta64(1, "D")) assert Timedelta("1D") == conv(np.timedelta64(1, "D")) assert Timedelta("10D") == conv(np.timedelta64(10, "D")) assert Timedelta("100D") == conv(np.timedelta64(100, "D")) @@ -663,6 +668,26 @@ def test_resolution_deprecated(self): result = Timedelta.resolution assert result == Timedelta(nanoseconds=1) + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Timedelta(1, unit_depr) + assert result == Timedelta(1, unit) + @pytest.mark.parametrize( "value, expected", diff --git a/pandas/tests/scalar/timestamp/methods/test_replace.py b/pandas/tests/scalar/timestamp/methods/test_replace.py index c5169fdff0cd4..f15ea0e485cae 100644 --- a/pandas/tests/scalar/timestamp/methods/test_replace.py +++ b/pandas/tests/scalar/timestamp/methods/test_replace.py @@ -1,9 +1,9 @@ from datetime import datetime +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -111,8 +111,8 @@ def test_replace_tzinfo_equiv_tz_localize_none(self): @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo(self): # GH#15683 - dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=1) + tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) @@ -137,13 +137,16 @@ def test_replace_tzinfo(self): @pytest.mark.parametrize( "tz, normalize", [ - (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + ("pytz/US/Eastern", lambda x: x.tzinfo.normalize(x)), (gettz("US/Eastern"), lambda x: x), ], ) def test_replace_across_dst(self, tz, normalize): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) ts_naive = Timestamp("2017-12-03 16:03:30") ts_aware = conversion.localize_pydatetime(ts_naive, tz) diff --git a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py index b576317fca8b4..beacaaf04e6b2 100644 --- a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py +++ b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py @@ -1,8 +1,8 @@ # NB: This is for the Timestamp.timestamp *method* specifically, not # the Timestamp class in general. 
+from datetime import timezone

 import pytest
-from pytz import utc

 from pandas._libs.tslibs import Timestamp
 from pandas.compat import WASM
@@ -18,7 +18,7 @@ def test_timestamp(self, fixed_now_ts):
         # GH#17329
         # tz-naive --> treat it as if it were UTC for purposes of timestamp()
         ts = fixed_now_ts
-        uts = ts.replace(tzinfo=utc)
+        uts = ts.replace(tzinfo=timezone.utc)
         assert ts.timestamp() == uts.timestamp()

         tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central")
diff --git a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py
index be6ec7dbc24c7..07e57b51a7f1e 100644
--- a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py
+++ b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py
@@ -3,7 +3,7 @@
     timedelta,
 )

-import pytz
+import pytest

 from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
 import pandas.util._test_decorators as td
@@ -43,6 +43,7 @@ def test_timestamp_to_pydatetime_dateutil(self):
         assert stamp.tzinfo == dtval.tzinfo

     def test_timestamp_to_pydatetime_explicit_pytz(self):
+        pytz = pytest.importorskip("pytz")
         stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern"))
         dtval = stamp.to_pydatetime()
         assert stamp == dtval
diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
index 0786cc58a4f95..90dc8d77608cb 100644
--- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
+++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
@@ -1,5 +1,6 @@
 from datetime import timedelta
 import re
+import zoneinfo

 from dateutil.tz import gettz
 import pytest
@@ -17,68 +18,57 @@
     Timestamp,
 )

-try:
-    from zoneinfo import ZoneInfo
-except ImportError:
-    # Cannot assign to a type
-    ZoneInfo = None  # type: ignore[misc, assignment]
-

 class TestTimestampTZLocalize:
     @pytest.mark.skip_ubsan
     def test_tz_localize_pushes_out_of_bounds(self):
         # GH#12677
         # tz_localize that pushes away from the boundary is OK
+        pytz = pytest.importorskip("pytz")
         msg = (
             f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} "
             f"underflows past {Timestamp.min}"
         )
-        pac = Timestamp.min.tz_localize("US/Pacific")
+        pac = Timestamp.min.tz_localize(pytz.timezone("US/Pacific"))
         assert pac._value > Timestamp.min._value
         pac.tz_convert("Asia/Tokyo")  # tz_convert doesn't change value
         with pytest.raises(OutOfBoundsDatetime, match=msg):
-            Timestamp.min.tz_localize("Asia/Tokyo")
+            Timestamp.min.tz_localize(pytz.timezone("Asia/Tokyo"))

         # tz_localize that pushes away from the boundary is OK
         msg = (
             f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} "
             f"overflows past {Timestamp.max}"
         )
-        tokyo = Timestamp.max.tz_localize("Asia/Tokyo")
+        tokyo = Timestamp.max.tz_localize(pytz.timezone("Asia/Tokyo"))
         assert tokyo._value < Timestamp.max._value
         tokyo.tz_convert("US/Pacific")  # tz_convert doesn't change value
         with pytest.raises(OutOfBoundsDatetime, match=msg):
-            Timestamp.max.tz_localize("US/Pacific")
+            Timestamp.max.tz_localize(pytz.timezone("US/Pacific"))

-    def test_tz_localize_ambiguous_bool(self, unit):
+    @pytest.mark.parametrize(
+        "tz",
+        [zoneinfo.ZoneInfo("US/Central"), "dateutil/US/Central", "pytz/US/Central"],
+    )
+    def test_tz_localize_ambiguous_bool(self, unit, tz):
         # make sure that we are correctly accepting bool values as ambiguous
         # GH#14402
+        if isinstance(tz, str) and tz.startswith("pytz/"):
+            pytz = pytest.importorskip("pytz")
+            tz = pytz.timezone(tz.removeprefix("pytz/"))
         ts = Timestamp("2015-11-01 01:00:03").as_unit(unit)
-        expected0 = Timestamp("2015-11-01
01:00:03-0500", tz="US/Central") - expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz) + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz) msg = "Cannot infer dst time from 2015-11-01 01:00:03" with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("US/Central") + ts.tz_localize(tz) - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("dateutil/US/Central") - - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Central") - except KeyError: - # no tzdata - pass - else: - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize(tz) - - result = ts.tz_localize("US/Central", ambiguous=True) + result = ts.tz_localize(tz, ambiguous=True) assert result == expected0 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = ts.tz_localize("US/Central", ambiguous=False) + result = ts.tz_localize(tz, ambiguous=False) assert result == expected1 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value @@ -205,9 +194,10 @@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): def test_tz_localize_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens + pytz = pytest.importorskip("pytz") naive = Timestamp("2013-10-27 01:00:00") - pytz_zone = "Europe/London" + pytz_zone = pytz.timezone("Europe/London") dateutil_zone = "dateutil/Europe/London" result_pytz = naive.tz_localize(pytz_zone, ambiguous=False) result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=False) @@ -236,13 +226,16 @@ def test_tz_localize_ambiguous_compat(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), - "US/Eastern", + zoneinfo.ZoneInfo("US/Eastern"), "dateutil/US/Eastern", ], ) def test_timestamp_tz_localize(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) stamp = Timestamp("3/11/2012 04:00") result = stamp.tz_localize(tz) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 2d58513989a66..7aa6c6c0496a9 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -7,7 +7,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -294,7 +293,7 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -302,7 +301,9 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): ) def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): # GH#1389 - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) # 4 hours before DST transition stamp = Timestamp("3/10/2012 22:00", tz=tz) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4ebdea3733484..39f302c3357de 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -18,7 +18,6 @@ import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.compat import PY310 from 
pandas.errors import OutOfBoundsDatetime from pandas import ( @@ -123,6 +122,7 @@ def test_timestamp_constructor_pytz_fold_raise(self): # Test for GH#25057 # pytz doesn't support fold. Check that we raise # if fold is passed with pytz + pytz = pytest.importorskip("pytz") msg = "pytz timezones do not support fold. Please use dateutil timezones." tz = pytz.timezone("Europe/London") with pytest.raises(ValueError, match=msg): @@ -160,15 +160,13 @@ def test_timestamp_constructor_retain_fold(self, tz, fold): expected = fold assert result == expected - try: - _tzs = [ + @pytest.mark.parametrize( + "tz", + [ "dateutil/Europe/London", zoneinfo.ZoneInfo("Europe/London"), - ] - except zoneinfo.ZoneInfoNotFoundError: - _tzs = ["dateutil/Europe/London"] - - @pytest.mark.parametrize("tz", _tzs) + ], + ) @pytest.mark.parametrize( "ts_input,fold_out", [ @@ -211,11 +209,7 @@ def test_timestamp_constructor_adjust_value_for_fold(self, tz, fold, value_out): class TestTimestampConstructorPositionalAndKeywordSupport: def test_constructor_positional(self): # see GH#10758 - msg = ( - "'NoneType' object cannot be interpreted as an integer" - if PY310 - else "an integer is required" - ) + msg = "'NoneType' object cannot be interpreted as an integer" with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) @@ -565,11 +559,11 @@ def test_constructor(self): timezones = [ (None, 0), ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -622,11 +616,11 @@ def test_constructor_with_stringoffset(self): timezones = [ ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -706,7 +700,7 @@ def test_constructor_invalid_tz(self): msg = "at most one of" with pytest.raises(ValueError, match=msg): - Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") + Timestamp("2017-10-22", tzinfo=timezone.utc, tz="UTC") msg = "Cannot pass a date attribute keyword argument when passing a date string" with pytest.raises(ValueError, match=msg): @@ -719,11 +713,11 @@ def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 stamps = [ Timestamp(year=2017, month=10, day=22, tz="UTC"), - Timestamp(year=2017, month=10, day=22, tzinfo=pytz.utc), - Timestamp(year=2017, month=10, day=22, tz=pytz.utc), - Timestamp(datetime(2017, 10, 22), tzinfo=pytz.utc), + Timestamp(year=2017, month=10, day=22, tzinfo=timezone.utc), + Timestamp(year=2017, month=10, day=22, tz=timezone.utc), + Timestamp(datetime(2017, 10, 22), tzinfo=timezone.utc), Timestamp(datetime(2017, 10, 22), tz="UTC"), - Timestamp(datetime(2017, 10, 22), tz=pytz.utc), + Timestamp(datetime(2017, 10, 22), tz=timezone.utc), ] assert all(ts == stamps[0] for ts in stamps) @@ -898,13 +892,13 @@ def test_construct_timestamp_near_dst(self, offset): def test_construct_with_different_string_format(self, arg): # GH 12064 result = Timestamp(arg) - expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + expected = Timestamp(datetime(2013, 1, 1), tz=timezone(timedelta(hours=9))) assert result == expected @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 - kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} + kwargs = {"year": 2018, "month": 
1, "day": 1, "tzinfo": timezone.utc} msg = "Cannot pass a datetime or Timestamp" with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tz="US/Pacific") @@ -912,7 +906,7 @@ def test_raise_tz_and_tzinfo_in_datetime_input(self, box): with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) - def test_dont_convert_dateutil_utc_to_pytz_utc(self): + def test_dont_convert_dateutil_utc_to_default_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) assert result == expected @@ -996,7 +990,7 @@ def test_timestamp_constructor_near_dst_boundary(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -1005,7 +999,9 @@ def test_timestamp_constructor_near_dst_boundary(self): def test_timestamp_constructed_by_date_and_tz(self, tz): # GH#2993, Timestamp cannot be constructed by datetime.date # and tz correctly - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) result = Timestamp(date(2012, 3, 11), tz=tz) expected = Timestamp("3/11/2012", tz=tz) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 44db1187850c9..7b20f0a17556d 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pprint import dateutil.tz import pytest -import pytz # a test below uses pytz but only inside a `eval` call from pandas.compat import WASM @@ -181,14 +183,14 @@ def test_repr_matches_pydatetime_no_tz(self): ts_nanos_micros = Timestamp(1200) assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" - def test_repr_matches_pydatetime_tz_pytz(self): - dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + def test_repr_matches_pydatetime_tz_stdlib(self): + dt_date = datetime(2013, 1, 2, tzinfo=timezone.utc) assert str(dt_date) == str(Timestamp(dt_date)) - dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=timezone.utc) assert str(dt_datetime) == str(Timestamp(dt_datetime)) - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=timezone.utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_repr_matches_pydatetime_tz_dateutil(self): diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 79fd285073983..38d0ddfbc13bd 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -9,6 +9,7 @@ import locale import time import unicodedata +import zoneinfo from dateutil.tz import ( tzlocal, @@ -20,8 +21,6 @@ ) import numpy as np import pytest -import pytz -from pytz import utc from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.timezones import ( @@ -259,7 +258,7 @@ def test_dow_parametric(self, ts, sign): class TestTimestamp: - @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + @pytest.mark.parametrize("tz", [None, zoneinfo.ZoneInfo("US/Pacific")]) def test_disallow_setting_tz(self, tz): # GH#3746 ts = Timestamp("2010") @@ -311,7 +310,7 @@ def compare(x, y): assert 
int((Timestamp(x)._value - Timestamp(y)._value) / 1e9) == 0 compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now("UTC"), datetime.now(pytz.timezone("UTC"))) + compare(Timestamp.now("UTC"), datetime.now(timezone.utc)) compare(Timestamp.now("UTC"), datetime.now(tzutc())) msg = "Timestamp.utcnow is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -329,12 +328,12 @@ def compare(x, y): compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, "UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, tz="UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) date_component = datetime.now(timezone.utc) @@ -585,9 +584,9 @@ def test_month_name(self, dt64, ts): assert ts.month_name() == alt.month_name() def test_tz_convert(self, ts): - ts = Timestamp._from_value_and_reso(ts._value, ts._creso, utc) + ts = Timestamp._from_value_and_reso(ts._value, ts._creso, timezone.utc) - tz = pytz.timezone("US/Pacific") + tz = zoneinfo.ZoneInfo("US/Pacific") result = ts.tz_convert(tz) assert isinstance(result, Timestamp) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 3b41c8ee463d8..97cafc33611ed 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -14,7 +14,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs import index as libindex @@ -63,6 +62,7 @@ def test_fancy_setitem(): @pytest.mark.parametrize("tz_source", ["pytz", "dateutil"]) def test_getitem_setitem_datetime_tz(tz_source): if tz_source == "pytz": + pytz = pytest.importorskip(tz_source) tzget = pytz.timezone else: # handle special case for utc in dateutil diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 5002b6d20da09..228e5cb509982 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -432,28 +432,38 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. 
- def _check_setitem_invalid(self, ser, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, ser, invalid, indexer): orig_ser = ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[:] = invalid + def _check_setitem_valid(self, ser, value, indexer): + orig_ser = ser.copy() + + ser[indexer] = value + ser = orig_ser.copy() + + ser.iloc[indexer] = value + ser = orig_ser.copy() + + ser.loc[indexer] = value + ser = orig_ser.copy() + + ser[:] = value + _invalid_scalars = [ 1 + 2j, "True", @@ -471,20 +481,19 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) if isna(invalid) and invalid is not NaT and not np.isnat(invalid): - warn = None + self._check_setitem_valid(ser, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(ser, invalid, indexer, warn) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 69fba8925784e..253339f8a6446 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,3 +1,4 @@ +import contextlib from datetime import ( date, datetime, @@ -273,25 +274,16 @@ def test_setitem_mask_align_and_promote(self): mask = ts > 0 left = ts.copy() right = ts[mask].copy().map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left[mask] = right - expected = ts.map(lambda t: str(t) if t > 0 else t) - tm.assert_series_equal(left, expected) def test_setitem_mask_promote_strs(self): ser = Series([0, 1, 2, 0]) mask = ser > 0 ser2 = ser[mask].map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = ser2 - expected = Series([0, "1", "2", 0]) - tm.assert_series_equal(ser, expected) - def test_setitem_mask_promote(self): ser = Series([0, "foo", "bar", 0]) mask = 
Series([False, True, True, False])
@@ -379,12 +371,8 @@ def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(self):
     def test_setitem_nan_with_bool(self):
         # GH 13034
         result = Series([True, False, True])
-        with tm.assert_produces_warning(
-            FutureWarning, match="item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             result[0] = np.nan
-        expected = Series([np.nan, False, True], dtype=object)
-        tm.assert_series_equal(result, expected)

     def test_setitem_mask_smallint_upcast(self):
         orig = Series([1, 2, 3], dtype="int8")
@@ -393,22 +381,14 @@ def test_setitem_mask_smallint_upcast(self):
         mask = np.array([True, False, True])

         ser = orig.copy()
-        with tm.assert_produces_warning(
-            FutureWarning, match="item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             ser[mask] = Series(alt)
-        expected = Series([999, 2, 1001])
-        tm.assert_series_equal(ser, expected)

-        ser2 = orig.copy()
-        with tm.assert_produces_warning(
-            FutureWarning, match="item of incompatible dtype"
-        ):
-            ser2.mask(mask, alt, inplace=True)
-        tm.assert_series_equal(ser2, expected)
+        with pytest.raises(TypeError, match="Invalid value"):
+            ser.mask(mask, alt, inplace=True)

-        ser3 = orig.copy()
-        res = ser3.where(~mask, Series(alt))
+        res = ser.where(~mask, Series(alt))
+        expected = Series([999, 2, 1001])
         tm.assert_series_equal(res, expected)

     def test_setitem_mask_smallint_no_upcast(self):
@@ -575,32 +555,35 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype):
         tm.assert_series_equal(ser, expected)

     @pytest.mark.parametrize(
-        "na, target_na, dtype, target_dtype, indexer, warn",
+        "na, target_na, dtype, target_dtype, indexer, raises",
         [
-            (NA, NA, "Int64", "Int64", 1, None),
-            (NA, NA, "Int64", "Int64", 2, None),
-            (NA, np.nan, "int64", "float64", 1, None),
-            (NA, np.nan, "int64", "float64", 2, None),
-            (NaT, NaT, "int64", "object", 1, FutureWarning),
-            (NaT, NaT, "int64", "object", 2, None),
-            (np.nan, NA, "Int64", "Int64", 1, None),
-            (np.nan, NA, "Int64", "Int64", 2, None),
-            (np.nan, NA, "Float64", "Float64", 1, None),
-            (np.nan, NA, "Float64", "Float64", 2, None),
-            (np.nan, np.nan, "int64", "float64", 1, None),
-            (np.nan, np.nan, "int64", "float64", 2, None),
+            (NA, NA, "Int64", "Int64", 1, False),
+            (NA, NA, "Int64", "Int64", 2, False),
+            (NA, np.nan, "int64", "float64", 1, False),
+            (NA, np.nan, "int64", "float64", 2, False),
+            (NaT, NaT, "int64", "object", 1, True),
+            (NaT, NaT, "int64", "object", 2, False),
+            (np.nan, NA, "Int64", "Int64", 1, False),
+            (np.nan, NA, "Int64", "Int64", 2, False),
+            (np.nan, NA, "Float64", "Float64", 1, False),
+            (np.nan, NA, "Float64", "Float64", 2, False),
+            (np.nan, np.nan, "int64", "float64", 1, False),
+            (np.nan, np.nan, "int64", "float64", 2, False),
        ],
    )
    def test_setitem_enlarge_with_na(
-        self, na, target_na, dtype, target_dtype, indexer, warn
+        self, na, target_na, dtype, target_dtype, indexer, raises
    ):
        # GH#32346
        ser = Series([1, 2], dtype=dtype)
-        with tm.assert_produces_warning(warn, match="incompatible dtype"):
+        if raises:
+            with pytest.raises(TypeError, match="Invalid value"):
+                ser[indexer] = na
+        else:
            ser[indexer] = na
-        expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na]
-        expected = Series(expected_values, dtype=target_dtype)
-        tm.assert_series_equal(ser, expected)
+            expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na]
+            expected = Series(expected_values, dtype=target_dtype)
+            tm.assert_series_equal(ser, expected)

     def
test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 @@ -694,14 +677,8 @@ def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): if not unique: ser.index = [1, 1] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[1] = val - assert type(ser.iloc[1]) == type(val) - - expected = Series([True, val], dtype=object, index=ser.index) - if not unique and indexer_sli is not tm.iloc: - expected = Series([val, val], dtype=object, index=[1, 1]) - tm.assert_series_equal(ser, expected) def test_setitem_boolean_array_into_npbool(self): # GH#45462 @@ -712,10 +689,8 @@ def test_setitem_boolean_array_into_npbool(self): ser[:2] = arr[:2] # no NAs -> can set inplace assert ser._values is values - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[1:] = arr[1:] # has an NA -> cast to boolean dtype - expected = Series(arr) - tm.assert_series_equal(ser, expected) class SetitemCastingEquivalents: @@ -759,64 +734,72 @@ def _check_inplace(self, is_inplace, orig, arr, obj): # otherwise original array should be unchanged tm.assert_equal(arr, orig._values) - def test_int_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_int_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, int): pytest.skip("Not relevant for int key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) if indexer_sli is tm.loc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.at, is_inplace) elif indexer_sli is tm.iloc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.iat, is_inplace) rng = range(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, rng, expected, val, indexer_sli, is_inplace) if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently slc = slice(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, slc, expected, val, indexer_sli, is_inplace) ilkey = [key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in [key]) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, slice): pytest.skip("Not relevant for slice key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently - with 
tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) ilkey = list(range(len(obj)))[key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in indkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): + def test_mask_key(self, obj, key, expected, raises, val, indexer_sli): # setitem with boolean mask mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -829,11 +812,13 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val return - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + indexer_sli(obj)[mask] = val + else: indexer_sli(obj)[mask] = val - tm.assert_series_equal(obj, expected) - def test_series_where(self, obj, key, expected, warn, val, is_inplace): + def test_series_where(self, obj, key, expected, raises, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -860,7 +845,7 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + def test_index_where(self, obj, key, expected, raises, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -872,7 +857,7 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, raises, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -885,7 +870,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): @pytest.mark.parametrize( - "obj,expected,key,warn", + "obj,expected,key,raises", [ pytest.param( # GH#45568 setting a valid NA value into IntervalDtype[int] should @@ -896,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): dtype="interval[float64]", ), 1, - FutureWarning, + True, id="interval_int_na_value", ), pytest.param( @@ -904,14 +889,14 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - None, + False, id="int_series_slice_key_step", ), pytest.param( Series([True, True, False, False]), Series([np.nan, True, np.nan, False], dtype=object), slice(None, None, 2), - FutureWarning, + True, id="bool_series_slice_key_step", ), pytest.param( @@ -919,7 +904,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - None, + False, id="int_series_slice_key", ), pytest.param( @@ -927,7 +912,7 @@ def test_index_putmask(self, obj, key, 
expected, warn, val, using_infer_string): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - None, + False, id="int_series_int_key", ), pytest.param( @@ -936,7 +921,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([np.nan], dtype=object), # TODO: maybe go to float64 since we are changing the _whole_ Series? 0, - FutureWarning, + True, id="bool_series_int_key_change_all", ), pytest.param( @@ -944,7 +929,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([False, True]), Series([np.nan, True], dtype=object), 0, - FutureWarning, + True, id="bool_series_int_key", ), ], @@ -994,8 +979,8 @@ def key(self): return 0 @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemDT64IntoInt(SetitemCastingEquivalents): @@ -1034,8 +1019,8 @@ def val(self, scalar, request): return box([scalar, scalar]) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemNAPeriodDtype(SetitemCastingEquivalents): @@ -1061,8 +1046,8 @@ def val(self, request): return request.param @pytest.fixture - def warn(self): - return None + def raises(self): + return False class TestSetitemNADatetimeLikeDtype(SetitemCastingEquivalents): @@ -1114,8 +1099,8 @@ def key(self): return 0 @pytest.fixture - def warn(self, is_inplace): - return None if is_inplace else FutureWarning + def raises(self, is_inplace): + return False if is_inplace else True class TestSetitemMismatchedTZCastsToObject(SetitemCastingEquivalents): @@ -1146,24 +1131,23 @@ def expected(self, obj, val): return expected @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "obj,expected,warn", + "obj,expected", [ # For numeric series, we should coerce to NaN. - (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), - (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), + (Series([1, 2, 3]), Series([np.nan, 2, 3])), + (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0])), # For datetime series, we should coerce to NaT. ( Series([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]), Series([NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - None, ), # For objects, we should preserve the None value. 
- (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"]), None), + (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"])), ], ) class TestSeriesNoneCoercion(SetitemCastingEquivalents): @@ -1175,6 +1159,10 @@ def key(self): def val(self): return None + @pytest.fixture + def raises(self): + return False + class TestSetitemFloatIntervalWithIntIntervalValues(SetitemCastingEquivalents): # GH#44201 Cast to shared IntervalDtype rather than object @@ -1185,11 +1173,8 @@ def test_setitem_example(self): obj = Series(idx) val = Interval(0.5, 1.5) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj[0] = val - assert obj.dtype == "Interval[float64, right]" @pytest.fixture def obj(self): @@ -1211,8 +1196,8 @@ def expected(self, obj, val): return Series(idx) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): @@ -1240,18 +1225,18 @@ def expected(self, any_int_numpy_dtype): return exp @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val, warn", + "val, raises", [ - (np.array([2.0, 3.0]), None), - (np.array([2.5, 3.5]), FutureWarning), + (np.array([2.0, 3.0]), False), + (np.array([2.5, 3.5]), True), ( np.array([2**65, 2**65 + 1], dtype=np.float64), - FutureWarning, + True, ), # all ints, but can't cast ], ) @@ -1291,8 +1276,8 @@ def expected(self): return Series([1, 512, 3], dtype=np.int16) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize("val", [2**33 + 1.0, 2**33 + 1.1, 2**62]) @@ -1315,8 +1300,8 @@ def expected(self, val): return Series([val, 2, 3], dtype=dtype) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class CoercionTest(SetitemCastingEquivalents): @@ -1334,8 +1319,8 @@ def expected(self, obj, key, val, exp_dtype): @pytest.mark.parametrize( - "val,exp_dtype,warn", - [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, FutureWarning)], + "val,exp_dtype,raises", + [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, True)], ) class TestCoercionInt8(CoercionTest): # previously test_setitem_series_int8 in tests.indexing.test_coercion @@ -1353,17 +1338,17 @@ def obj(self): return Series(["a", "b", "c", "d"], dtype=object) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.complex128, None), - (1.1, np.complex128, None), - (1 + 1j, np.complex128, None), - (True, object, FutureWarning), + (1, np.complex128, False), + (1.1, np.complex128, False), + (1 + 1j, np.complex128, False), + (True, object, True), ], ) class TestCoercionComplex(CoercionTest): @@ -1374,14 +1359,14 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, object, FutureWarning), - ("3", object, FutureWarning), - (3, object, FutureWarning), - (1.1, object, FutureWarning), - (1 + 1j, object, FutureWarning), - (True, bool, None), + (1, object, True), + ("3", object, True), + (3, object, True), + (1.1, object, True), + (1 + 1j, object, True), + (True, bool, False), ], ) class TestCoercionBool(CoercionTest): @@ -1392,12 +1377,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.int64, None), - (1.1, np.float64, 
FutureWarning), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.int64, False), + (1.1, np.float64, True), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionInt64(CoercionTest): @@ -1408,12 +1393,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float64, None), - (1.1, np.float64, None), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.float64, False), + (1.1, np.float64, False), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionFloat64(CoercionTest): @@ -1424,13 +1409,13 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float32, None), + (1, np.float32, False), pytest.param( 1.1, np.float32, - None, + False, marks=pytest.mark.xfail( ( not np_version_gte1p24 @@ -1440,16 +1425,16 @@ def obj(self): "np_can_hold_element raises and we cast to float64", ), ), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), - (np.uint8(2), np.float32, None), - (np.uint32(2), np.float32, None), + (1 + 1j, np.complex128, True), + (True, object, True), + (np.uint8(2), np.float32, False), + (np.uint32(2), np.float32, False), # float32 cannot hold np.iinfo(np.uint32).max exactly # (closest it can hold is 4294967300.0 which off by 5.0), so # we cast to float64 - (np.uint32(np.iinfo(np.uint32).max), np.float64, FutureWarning), - (np.uint64(2), np.float32, None), - (np.int64(2), np.float32, None), + (np.uint32(np.iinfo(np.uint32).max), np.float64, True), + (np.uint64(2), np.float32, False), + (np.int64(2), np.float32, False), ], ) class TestCoercionFloat32(CoercionTest): @@ -1457,8 +1442,8 @@ class TestCoercionFloat32(CoercionTest): def obj(self): return Series([1.1, 2.2, 3.3, 4.4], dtype=np.float32) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): - super().test_slice_key(obj, key, expected, warn, val, indexer_sli, is_inplace) + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): + super().test_slice_key(obj, key, expected, raises, val, indexer_sli, is_inplace) if isinstance(val, float): # the xfail would xpass bc test_slice_key short-circuits @@ -1494,16 +1479,16 @@ def val(self, exp_dtype): return ts @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timestamp("2012-01-01"), "datetime64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timestamp("2012-01-01"), "datetime64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionDatetime64(CoercionTest): @@ -1514,18 +1499,18 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", None), + (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", False), # pre-2.0, a mis-matched tz would end up casting to object - (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", None), - (Timestamp("2012-01-01"), object, FutureWarning), - (1, object, FutureWarning), + (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", False), + (Timestamp("2012-01-01"), object, True), + 
(1, object, True), ], ) class TestCoercionDatetime64TZ(CoercionTest): @@ -1536,16 +1521,16 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4, tz=tz)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timedelta("12 day"), "timedelta64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timedelta("12 day"), "timedelta64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionTimedelta64(CoercionTest): @@ -1555,8 +1540,8 @@ def obj(self): return Series(timedelta_range("1 day", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( @@ -1575,63 +1560,45 @@ def obj(self, request): return Series(request.param) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True def test_20643(): # closed by GH#45121 orig = Series([0, 1, 2], index=["a", "b", "c"]) - expected = Series([0, 2.7, 2], index=["a", "b", "c"]) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser["b"] = 2.7 - tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] = 2.7 - tm.assert_series_equal(ser, expected) orig_df = orig.to_frame("A") - expected_df = expected.to_frame("A") df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.at["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iat[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) def test_20643_comment(): @@ -1653,35 +1620,23 @@ def test_15413(): # fixed by GH#45121 ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[ser == 2] += 0.5 - expected = Series([1, 2.5, 3]) - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with 
pytest.raises(TypeError, match="Invalid value"): ser[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at[1] += 0.5 - tm.assert_series_equal(ser, expected) def test_32878_int_itemsize(): @@ -1689,10 +1644,8 @@ def test_32878_int_itemsize(): arr = np.arange(5).astype("i4") ser = Series(arr) val = np.int64(np.iinfo(np.int64).max) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - expected = Series([val, 1, 2, 3, 4], dtype=np.int64) - tm.assert_series_equal(ser, expected) def test_32878_complex_itemsize(): @@ -1702,20 +1655,15 @@ def test_32878_complex_itemsize(): val = val.astype("c16") # GH#32878 used to coerce val to inf+0.000000e+00j - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - assert ser[0] == val - expected = Series([val, 1, 2, 3, 4], dtype="c16") - tm.assert_series_equal(ser, expected) def test_37692(indexer_al): # GH#37692 ser = Series([1, 2, 3], index=["a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(ser)["b"] = "test" - expected = Series([1, "test", 3], index=["a", "b", "c"], dtype=object) - tm.assert_series_equal(ser, expected) def test_setitem_bool_int_float_consistency(indexer_sli): @@ -1725,14 +1673,12 @@ def test_setitem_bool_int_float_consistency(indexer_sli): # as the setitem can be done losslessly for dtype in [np.float64, np.int64]: ser = Series(0, index=range(3), dtype=dtype) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[0] = True - assert ser.dtype == object ser = Series(0, index=range(3), dtype=bool) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = dtype(1) - assert ser.dtype == object # 1.0 can be held losslessly, so no casting ser = Series(0, index=range(3), dtype=np.int64) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 4979bcb42d7ab..7718899ff234b 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -55,15 +55,13 @@ def test_where_unsafe_upcast(dtype, expected_dtype): s = Series(np.arange(10), dtype=dtype) values = [2.5, 3.5, 4.5, 5.5, 6.5] mask = s < 5 - expected = Series(values + list(range(5, 10)), dtype=expected_dtype) - warn = ( - None - if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f" - else FutureWarning - ) - with 
tm.assert_produces_warning(warn, match="incompatible dtype"): + if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f": s[mask] = values - tm.assert_series_equal(s, expected) + expected = Series(values + list(range(5, 10)), dtype=expected_dtype) + tm.assert_series_equal(s, expected) + else: + with pytest.raises(TypeError, match="Invalid value"): + s[mask] = values def test_where_unsafe(): @@ -74,9 +72,11 @@ mask = s > 5 expected = Series(list(range(6)) + values, dtype="float64") - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): s[mask] = values - tm.assert_series_equal(s, expected) + s = s.astype("float64") + s[mask] = values + tm.assert_series_equal(s, expected) # see gh-3235 s = Series(np.arange(10), dtype="int64") diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 432c0eceee011..c1082c06ce307 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -20,21 +20,15 @@ def test_argsort_axis(self): def test_argsort_numpy(self, datetime_series): ser = datetime_series - res = np.argsort(ser).values expected = np.argsort(np.array(ser)) tm.assert_numpy_array_equal(res, expected) - # with missing values - ts = ser.copy() - ts[::2] = np.nan - - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False - ): - result = np.argsort(ts)[1::2] - expected = np.argsort(np.array(ts.dropna())) + def test_argsort_numpy_missing(self): + data = [0.1, np.nan, 0.2, np.nan, 0.3] + ser = Series(data) + result = np.argsort(ser) + expected = np.argsort(np.array(data)) tm.assert_numpy_array_equal(result.values, expected) @@ -56,10 +50,8 @@ def test_argsort_dt64(self, unit): expected = Series(range(5), dtype=np.intp) tm.assert_series_equal(result, expected) - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype=np.intp) + result = shifted.argsort() + expected = Series(list(range(4)) + [4], dtype=np.intp) tm.assert_series_equal(result, expected) def test_argsort_stable(self): diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4b2122e25f819..d2d92d7273d3d 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -298,7 +298,7 @@ def test_astype_str_cast_dt64(self): def test_astype_str_cast_td64(self): # see GH#9757 - td = Series([Timedelta(1, unit="d")]) + td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) expected = Series(["1 days"], dtype=object) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index f6f3a3b0fb07e..7c96a5b0f00d1 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -231,7 +231,7 @@ def test_convert_dtypes( copy = series.copy(deep=True) if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result[result.notna()] = np.nan else: result[result.notna()] = np.nan diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index
c10bb8278a3d1..f53d75df83124 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -6,7 +6,6 @@ import numpy as np import pytest -import pytz from pandas import ( Categorical, @@ -159,9 +158,8 @@ def test_fillna_consistency(self): # assignment ser2 = ser.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser2[1] = "foo" - tm.assert_series_equal(ser2, expected) def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 @@ -861,7 +859,7 @@ def test_fillna_bug(self): def test_ffill_mixed_dtypes_without_missing_data(self): # GH#14956 - series = Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + series = Series([datetime(2015, 1, 1, tzinfo=timezone.utc), 1]) result = series.ffill() tm.assert_series_equal(series, result) @@ -923,16 +921,16 @@ def test_datetime64tz_fillna_round_issue(self): # GH#14872 data = Series( - [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc)] ) filled = data.bfill() expected = Series( [ - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), ] ) diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index f25583904377a..4a11d7905f506 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -1,5 +1,4 @@ from pandas import ( - Index, Series, date_range, ) @@ -19,7 +18,7 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() - expected = Series([], dtype=object, index=Index([], dtype=object)) + expected = Series([], dtype=object) tm.assert_series_equal(result, expected) obj = Series([True, False, True]) @@ -28,5 +27,5 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series(date_range("20130101", periods=3)) result = obj._get_numeric_data() - expected = Series([], dtype="M8[ns]", index=Index([], dtype=object)) + expected = Series([], dtype="M8[ns]") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 937b85a547bcd..e997ae32cf2e2 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -92,7 +92,7 @@ def test_isin_with_i8(self): tm.assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) + s = Series(pd.to_timedelta(range(5), unit="D")) result = s.isin(s[0:2]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 56b7cf42a798d..6a5b58c5da6b5 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -46,7 +46,7 @@ def test_nlargest_error(self, r, method, arg): [ pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), pd.to_datetime(["2003", "2002", "2001", "2002", "2005"], utc=True), - pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + pd.to_timedelta(["3D", "2D", "1D", "2D", 
"5D"]), np.array([3, 2, 1, 2, 5], dtype="int8"), np.array([3, 2, 1, 2, 5], dtype="int16"), np.array([3, 2, 1, 2, 5], dtype="int32"), diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0a79bcea679a7..90654df155cf0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -370,9 +370,7 @@ def test_replace_mixed_types_with_string(self): def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(pd.Categorical(categorical, categories=["A", "B"])) - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace({"A": 1, "B": 2}) + result = ser.cat.rename_categories({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present @@ -380,16 +378,13 @@ def test_replace_categorical(self, categorical, numeric): expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result, check_categorical=False) - @pytest.mark.parametrize( - "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] - ) - def test_replace_categorical_inplace(self, data, data_exp): + def test_replace_categorical_inplace(self): # GH 53358 + data = ["a", "b", "c"] + data_exp = ["b", "b", "c"] result = pd.Series(data, dtype="category") - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result.replace(to_replace="a", value="b", inplace=True) - expected = pd.Series(data_exp, dtype="category") + result.replace(to_replace="a", value="b", inplace=True) + expected = pd.Series(pd.Categorical(data_exp, categories=data)) tm.assert_series_equal(result, expected) def test_replace_categorical_single(self): @@ -404,25 +399,10 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = c.replace(c[2], "foo") + result = c.cat.rename_categories({c.values[2]: "foo"}) tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[2], "foo", inplace=True) - assert return_value is None - tm.assert_series_equal(expected, c) - - first_value = c[0] - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[1], c[0], inplace=True) - assert return_value is None - assert c[0] == c[1] == first_value # test replacing with existing value - def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 1d29e116be5c2..9b5fb098bf3ee 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -35,37 +35,39 @@ def test_update(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "other, dtype, expected, warn", + "other, dtype, expected, raises", [ # other is int - ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), None), - ([61, 63], "int64", Series([10, 61, 12]), None), - ([61, 63], float, Series([10.0, 61.0, 
12.0]), None), - ([61, 63], object, Series([10, 61, 12], dtype=object), None), + ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), False), + ([61, 63], "int64", Series([10, 61, 12]), False), + ([61, 63], float, Series([10.0, 61.0, 12.0]), False), + ([61, 63], object, Series([10, 61, 12], dtype=object), False), # other is float, but can be cast to int - ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), None), - ([61.0, 63.0], "int64", Series([10, 61, 12]), None), - ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), None), - ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), None), + ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), False), + ([61.0, 63.0], "int64", Series([10, 61, 12]), False), + ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), False), + ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), False), # others is float, cannot be cast to int - ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), FutureWarning), - ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), FutureWarning), - ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), None), - ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), None), + ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), True), + ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), True), + ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), False), + ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), False), # other is object, cannot be cast - ([(61,), (63,)], "int32", Series([10, (61,), 12]), FutureWarning), - ([(61,), (63,)], "int64", Series([10, (61,), 12]), FutureWarning), - ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), FutureWarning), - ([(61,), (63,)], object, Series([10, (61,), 12]), None), + ([(61,), (63,)], "int32", Series([10, (61,), 12]), True), + ([(61,), (63,)], "int64", Series([10, (61,), 12]), True), + ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), True), + ([(61,), (63,)], object, Series([10, (61,), 12]), False), ], ) - def test_update_dtypes(self, other, dtype, expected, warn): + def test_update_dtypes(self, other, dtype, expected, raises): ser = Series([10, 11, 12], dtype=dtype) other = Series(other, index=[1, 3]) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + ser.update(other) + else: ser.update(other) - - tm.assert_series_equal(ser, expected) + tm.assert_series_equal(ser, expected) @pytest.mark.parametrize( "values, other, expected", diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 108c3aabb1aa4..1c88329a83b0e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -37,13 +37,8 @@ def test_timedelta64_nan(self): assert not isna(td1[0]) # GH#16674 iNaT is treated as an integer when given by the user - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): td1[1] = iNaT - assert not isna(td1[1]) - assert td1.dtype == np.object_ - assert td1[1] == iNaT - td1[1] = td[1] - assert not isna(td1[1]) td1[2] = NaT assert isna(td1[2]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 134ebededd163..cdcd36846c560 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -873,6 +873,14 @@ def test_unique_masked(self, any_numeric_ea_dtype): expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype) tm.assert_extension_array_equal(result, expected) + def 
test_unique_NumpyExtensionArray(self): + arr_complex = pd.array( + [1 + 1j, 2, 3] + ) # NumpyEADtype('complex128') => NumpyExtensionArray + result = pd.unique(arr_complex) + expected = pd.array([1 + 1j, 2 + 0j, 3 + 0j]) + tm.assert_extension_array_equal(result, expected) + def test_nunique_ints(index_or_series_or_array): # GH#36327 @@ -1638,7 +1646,10 @@ def test_unique_tuples(self, arr, uniques): expected = np.empty(len(uniques), dtype=object) expected[:] = uniques - msg = "unique requires a Series, Index, ExtensionArray, or np.ndarray, got list" + msg = ( + r"unique requires a Series, Index, ExtensionArray, np.ndarray " + r"or NumpyExtensionArray got list" + ) with pytest.raises(TypeError, match=msg): # GH#52986 pd.unique(arr) @@ -1657,7 +1668,11 @@ def test_unique_tuples(self, arr, uniques): ) def test_unique_complex_numbers(self, array, expected): # GH 17927 - msg = "unique requires a Series, Index, ExtensionArray, or np.ndarray, got list" + msg = ( + r"unique requires a Series, Index, ExtensionArray, np.ndarray " + r"or NumpyExtensionArray got list" + ) + with pytest.raises(TypeError, match=msg): # GH#52986 pd.unique(array) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 53b730cfd9b6e..ca97af0d3eb32 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -3,7 +3,6 @@ import string import subprocess import sys -import textwrap import numpy as np import pytest @@ -250,21 +249,3 @@ def test_str_size(): ] result = subprocess.check_output(call).decode()[-4:-1].strip("\n") assert int(result) == int(expected) - - -@pytest.mark.single_cpu -def test_bz2_missing_import(): - # Check whether bz2 missing import is handled correctly (issue #53857) - code = """ - import sys - sys.modules['bz2'] = None - import pytest - import pandas as pd - from pandas.compat import get_bz2_file - msg = 'bz2 module not available.' - with pytest.raises(RuntimeError, match=msg): - get_bz2_file() - """ - code = textwrap.dedent(code) - call = [sys.executable, "-c", code] - subprocess.check_output(call) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8f661edf0f241..e87498742061b 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -288,6 +288,13 @@ def test_multiindex_with_na(self): tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("na", [None, np.nan]) + def test_multiindex_insert_level_with_na(self, na): + # GH 59003 + df = DataFrame([0], columns=[["A"], ["B"]]) + df[na, "B"] = 1 + tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"])) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 132608d7df115..56de3f7f39175 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -104,7 +104,9 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg): gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! 
- assert is_int64_overflow_possible(gr._grouper.shape) + assert is_int64_overflow_possible( + tuple(ping.ngroups for ping in gr._grouper.groupings) + ) mi = MultiIndex.from_arrays( [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index ce2e4e0f6cec5..451ef42fff3d1 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -5,6 +5,7 @@ from pandas._libs import iNaT +from pandas import array import pandas._testing as tm import pandas.core.algorithms as algos @@ -303,7 +304,14 @@ def test_take_coerces_list(self): arr = [1, 2, 3] msg = ( "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " - "Index, or Series, got list" + "Index, Series, or NumpyExtensionArray got list" ) with pytest.raises(TypeError, match=msg): algos.take(arr, [0, 0]) + + def test_take_NumpyExtensionArray(self): + # GH#59177 + arr = array([1 + 1j, 2, 3]) # NumpyEADtype('complex128') (NumpyExtensionArray) + assert algos.take(arr, [2]) == 3 + arr = array([1, 2, 3]) # Int64Dtype() (ExtensionArray) + assert algos.take(arr, [2]) == 3 diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index cbbd018720bad..c1d6baaf17c92 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -10,11 +10,11 @@ ) from decimal import Decimal import locale +import zoneinfo from dateutil.parser import parse import numpy as np import pytest -import pytz from pandas._libs import tslib from pandas._libs.tslibs import ( @@ -432,9 +432,11 @@ def test_to_datetime_format_weeks(self, value, fmt, expected, cache): ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], [ Timestamp( - "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) - ), # pytz coerces to UTC - Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0)), + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), + Timestamp( + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), ], ], ], @@ -1169,6 +1171,7 @@ def test_to_datetime_different_offsets_removed(self, cache): def test_to_datetime_tz_pytz(self, cache): # see gh-8260 + pytz = pytest.importorskip("pytz") us_eastern = pytz.timezone("US/Eastern") arr = np.array( [ @@ -1699,7 +1702,9 @@ def test_to_datetime_fixed_offset(self): ["2020-10-26 00:00:00+06:00", Timestamp("2018-01-01", tz="US/Pacific")], [ "2020-10-26 00:00:00+06:00", - datetime(2020, 1, 1, 18, tzinfo=pytz.timezone("Australia/Melbourne")), + datetime(2020, 1, 1, 18).astimezone( + zoneinfo.ZoneInfo("Australia/Melbourne") + ), ], ], ) @@ -2351,7 +2356,7 @@ def test_to_datetime_iso8601_non_padded(self, input, format): ) def test_to_datetime_iso8601_with_timezone_valid(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 - expected = Timestamp(2020, 1, 1, tzinfo=pytz.UTC) + expected = Timestamp(2020, 1, 1, tzinfo=timezone.utc) result = to_datetime(input, format=format) assert result == expected @@ -2778,7 +2783,7 @@ def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) result = to_datetime(ser) - tz = pytz.utc if zero_tz == "Z" else None + tz = timezone.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -3182,7 +3187,7 @@ def test_invalid_origin(self, unit): ) def test_epoch(self, units, epochs): epoch_1960 = Timestamp(1960, 1, 1) - units_from_epochs = list(range(5)) + units_from_epochs = np.arange(5, dtype=np.int64) expected =
Series( [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] ) @@ -3213,7 +3218,7 @@ def test_invalid_origins(self, origin, exc, units): def test_invalid_origins_tzinfo(self): # GH16842 with pytest.raises(ValueError, match="must be tz-naive"): - to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=timezone.utc)) def test_incorrect_value_exception(self): # GH47495 diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 894f49b2fa140..9ec2689069da9 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -56,7 +56,10 @@ def test_to_timedelta_same_np_timedelta64(self): def test_to_timedelta_series(self): # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(["1d", "1days 00:00:01"])) + + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(Series(["1d", "1days 00:00:01"])) tm.assert_series_equal(result, expected) def test_to_timedelta_units(self): diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 08f4a1250392e..ffe6ff0b51bcf 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -1,7 +1,9 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pytest -from pytz import utc from pandas import ( DatetimeIndex, @@ -128,9 +130,9 @@ def test_holiday_dates(holiday, start_date, end_date, expected): # Verify that timezone info is preserved. assert list( holiday.dates( - utc.localize(Timestamp(start_date)), utc.localize(Timestamp(end_date)) + Timestamp(start_date, tz=timezone.utc), Timestamp(end_date, tz=timezone.utc) ) - ) == [utc.localize(dt) for dt in expected] + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( @@ -194,8 +196,10 @@ def test_holidays_within_dates(holiday, start, expected): # Verify that timezone info is preserved. 
assert list( - holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(start))) - ) == [utc.localize(dt) for dt in expected] + holiday.dates( + Timestamp(start, tz=timezone.utc), Timestamp(start, tz=timezone.utc) + ) + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py index 8ff80536fc69e..dfdc69c0fe18e 100644 --- a/pandas/tests/tseries/offsets/test_dst.py +++ b/pandas/tests/tseries/offsets/test_dst.py @@ -5,7 +5,6 @@ from datetime import timedelta import pytest -import pytz from pandas._libs.tslibs import Timestamp from pandas._libs.tslibs.offsets import ( @@ -33,10 +32,8 @@ from pandas import DatetimeIndex import pandas._testing as tm -from pandas.util.version import Version -# error: Module has no attribute "__version__" -pytz_version = Version(pytz.__version__) # type: ignore[attr-defined] +pytz = pytest.importorskip("pytz") def get_utc_offset_hours(ts): @@ -52,7 +49,10 @@ class TestDST: # test both basic names and dateutil timezones timezone_utc_offsets = { - "US/Eastern": {"utc_offset_daylight": -4, "utc_offset_standard": -5}, + pytz.timezone("US/Eastern"): { + "utc_offset_daylight": -4, + "utc_offset_standard": -5, + }, "dateutil/US/Pacific": {"utc_offset_daylight": -7, "utc_offset_standard": -8}, } valid_date_offsets_singular = [ @@ -96,7 +96,10 @@ def _test_offset( if ( offset_name in ["hour", "minute", "second", "microsecond"] and offset_n == 1 - and tstart == Timestamp("2013-11-03 01:59:59.999999-0500", tz="US/Eastern") + and tstart + == Timestamp( + "2013-11-03 01:59:59.999999-0500", tz=pytz.timezone("US/Eastern") + ) ): # This addition results in an ambiguous wall time err_msg = { @@ -147,7 +150,9 @@ def _test_offset( assert datepart_offset == offset.kwds[offset_name] else: # the offset should be the same as if it was done in UTC - assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") + assert t == (tstart.tz_convert("UTC") + offset).tz_convert( + pytz.timezone("US/Pacific") + ) def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: @@ -224,16 +229,6 @@ def test_all_offset_classes(self, tup): @pytest.mark.parametrize( "original_dt, target_dt, offset, tz", [ - pytest.param( - Timestamp("1900-01-01"), - Timestamp("1905-07-01"), - MonthBegin(66), - "Africa/Lagos", - marks=pytest.mark.xfail( - pytz_version < Version("2020.5") or pytz_version == Version("2022.2"), - reason="GH#41906: pytz utc transition dates changed", - ), - ), ( Timestamp("2021-10-01 01:15"), Timestamp("2021-10-31 01:15"), @@ -263,7 +258,7 @@ def test_all_offset_classes(self, tup): def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz): # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt # is dst-ambiguous - localized_dt = original_dt.tz_localize(tz) + localized_dt = original_dt.tz_localize(pytz.timezone(tz)) msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument" with pytest.raises(pytz.AmbiguousTimeError, match=msg): diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 6a0b86cbd03ee..f62910b5e1f1c 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -1,8 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -from pytz import UTC from pandas._libs.tslibs import ( OutOfBoundsTimedelta, @@ -55,7 
+57,7 @@ def _compare_local_to_utc(tz_didx, naive_didx): def test_tz_localize_to_utc_copies(): # GH#46460 arr = np.arange(5, dtype="i8") - result = tz_convert_from_utc(arr, tz=UTC) + result = tz_convert_from_utc(arr, tz=timezone.utc) tm.assert_numpy_array_equal(result, arr) assert not np.shares_memory(arr, result) @@ -100,7 +102,7 @@ def test_tz_convert_readonly(): # GH#35530 arr = np.array([0], dtype=np.int64) arr.setflags(write=False) - result = tz_convert_from_utc(arr, UTC) + result = tz_convert_from_utc(arr, timezone.utc) tm.assert_numpy_array_equal(result, arr) @@ -141,14 +143,18 @@ class SubDatetime(datetime): "dt, expected", [ pytest.param( - Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + Timestamp("2000-01-01"), + Timestamp("2000-01-01", tz=timezone.utc), + id="timestamp", ), pytest.param( - datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + datetime(2000, 1, 1), + datetime(2000, 1, 1, tzinfo=timezone.utc), + id="datetime", ), pytest.param( SubDatetime(2000, 1, 1), - SubDatetime(2000, 1, 1, tzinfo=UTC), + SubDatetime(2000, 1, 1, tzinfo=timezone.utc), id="subclassed_datetime", ), ], @@ -157,5 +163,5 @@ def test_localize_pydatetime_dt_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with # localize_pydatetime - result = conversion.localize_pydatetime(dt, UTC) + result = conversion.localize_pydatetime(dt, timezone.utc) assert result == expected diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py index e9da6b3cf991c..0e7705ad7ed94 100644 --- a/pandas/tests/tslibs/test_resolution.py +++ b/pandas/tests/tslibs/test_resolution.py @@ -1,6 +1,7 @@ +import datetime + import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( Resolution, @@ -8,8 +9,6 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -import pandas._testing as tm - def test_get_resolution_nano(): # don't return the fallback RESO_DAY @@ -23,7 +22,7 @@ def test_get_resolution_non_nano_data(): res = get_resolution(arr, None, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US - res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) + res = get_resolution(arr, datetime.timezone.utc, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US @@ -49,16 +48,9 @@ def test_get_attrname_from_abbrev(freqstr, expected): @pytest.mark.parametrize("freq", ["H", "S"]) -def test_units_H_S_deprecated_from_attrname_to_abbrevs(freq): - # GH#52536 - msg = f"'{freq}' is deprecated and will be removed in a future version." - - with tm.assert_produces_warning(FutureWarning, match=msg): - Resolution.get_reso_from_freqstr(freq) - +def test_unit_H_S_raises(freq): + # GH#59143 + msg = f"Invalid frequency: {freq}" -@pytest.mark.parametrize("freq", ["T", "t", "L", "U", "N", "n"]) -def test_reso_abbrev_T_L_U_N_raises(freq): - msg = f"Frequency '{freq}' is no longer supported." 
with pytest.raises(ValueError, match=msg): Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 28e4889983fb9..8dd7060f21d59 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -6,7 +6,6 @@ import dateutil.tz import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -22,10 +21,11 @@ def test_is_utc(utc_fixture): assert timezones.is_utc(tz) -@pytest.mark.parametrize("tz_name", list(pytz.common_timezones)) -def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name): - tz_p = timezones.maybe_get_tz(tz_name) - tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) +def test_cache_keys_are_distinct_for_pytz_vs_dateutil(): + pytz = pytest.importorskip("pytz") + for tz_name in pytz.common_timezones: + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) if tz_d is None: pytest.skip(tz_name + ": dateutil does not know about this one") @@ -76,12 +76,15 @@ def test_tz_compare_utc(utc_fixture, utc_fixture2): @pytest.fixture( params=[ - (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + ("pytz/US/Eastern", lambda tz, x: tz.localize(x)), (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), ] ) def infer_setup(request): eastern, localize = request.param + if isinstance(eastern, str) and eastern.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + eastern = pytz.timezone(eastern.removeprefix("pytz/")) start_naive = datetime(2001, 1, 1) end_naive = datetime(2009, 1, 1) @@ -111,10 +114,10 @@ def test_infer_tz_compat(infer_setup): def test_infer_tz_utc_localize(infer_setup): _, _, start, end, start_naive, end_naive = infer_setup - utc = pytz.utc + utc = timezone.utc - start = utc.localize(start_naive) - end = utc.localize(end_naive) + start = start_naive.astimezone(utc) + end = end_naive.astimezone(utc) assert timezones.infer_tzinfo(start, end) is utc @@ -124,8 +127,8 @@ def test_infer_tz_mismatch(infer_setup, ordered): eastern, _, _, _, start_naive, end_naive = infer_setup msg = "Inputs must both have the same timezone" - utc = pytz.utc - start = utc.localize(start_naive) + utc = timezone.utc + start = start_naive.astimezone(utc) end = conversion.localize_pydatetime(end_naive, eastern) args = (start, end) if ordered else (end, start) @@ -139,7 +142,7 @@ def test_maybe_get_tz_invalid_types(): timezones.maybe_get_tz(44.0) with pytest.raises(TypeError, match=""): - timezones.maybe_get_tz(pytz) + timezones.maybe_get_tz(pytest) msg = "" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 07bdfca8f2f2d..9e32a33650591 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -61,11 +61,11 @@ def test_to_offset_negative(freqstr, expected): "2SMS-15D", "100foo", # Invalid leading +/- signs. - "+-1d", + "+-1D", "-+1h", "+1", "-7", - "+d", + "+D", "-m", # Invalid shortcut anchors. 
"SME-0", @@ -128,9 +128,14 @@ def test_to_offset_leading_zero(freqstr, expected): assert result.n == expected -@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) -def test_to_offset_leading_plus(freqstr, expected): - result = to_offset(freqstr) +@pytest.mark.parametrize( + "freqstr,expected,wrn", [("+1d", 1, FutureWarning), ("+2h30min", 150, None)] +) +def test_to_offset_leading_plus(freqstr, expected, wrn): + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(wrn, match=msg): + result = to_offset(freqstr) assert result.n == expected @@ -176,14 +181,6 @@ def test_anchored_shortcuts(shortcut, expected): assert result == expected -def test_to_offset_lowercase_frequency_w_deprecated(): - # GH#54939 - msg = "'w' is deprecated and will be removed in a future version" - - with tm.assert_produces_warning(FutureWarning, match=msg): - to_offset("2w") - - @pytest.mark.parametrize( "freq_depr", [ @@ -206,17 +203,7 @@ def test_to_offset_lowercase_frequency_raises(freq_depr): to_offset(freq_depr) -@pytest.mark.parametrize( - "freq_depr", - [ - "2H", - "2BH", - "2MIN", - "2S", - "2Us", - "2NS", - ], -) +@pytest.mark.parametrize("freq_depr", ["2MIN", "2Us", "2NS"]) def test_to_offset_uppercase_frequency_deprecated(freq_depr): # GH#54939 depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " @@ -224,3 +211,28 @@ def test_to_offset_uppercase_frequency_deprecated(freq_depr): with tm.assert_produces_warning(FutureWarning, match=depr_msg): to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr,expected", + [ + ("2w", offsets.Week(2, weekday=6)), + ("2b", offsets.BusinessDay(2)), + ("2d", offsets.Day(2)), + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr, expected): + # GH#54939, GH#58998 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_offset(freq_depr) + assert result == expected + + +@pytest.mark.parametrize("freq", ["2H", "2BH", "2S"]) +def test_to_offset_uppercase_frequency_raises(freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + to_offset(freq) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 35c896dc0090b..4ea6c805a2ee4 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -102,7 +102,8 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): tm.assert_frame_equal(result, expected) -def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): +def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit, adjust): + # GH 54328 tz = tz_aware_fixture halflife = "23 days" times = ( @@ -112,8 +113,11 @@ def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): ) data = np.arange(3) df = DataFrame(data) - result = df.ewm(halflife=halflife, times=times).mean() - expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + result = df.ewm(halflife=halflife, times=times, adjust=adjust).mean() + if adjust: + expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + else: + expected = DataFrame([0.0, 0.23762518642226227, 1.534926369128742]) tm.assert_frame_equal(result, expected) @@ -148,13 +152,56 @@ def test_ewm_getitem_attributes_retained(arg, adjust, ignore_na): assert result == expected -def test_ewma_times_adjust_false_raises(): - # GH 40098 +def test_ewma_times_adjust_false_with_disallowed_com(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + halflife="1D", + ) + + +def test_ewma_times_adjust_false_with_disallowed_alpha(): + # GH 54328 with pytest.raises( - NotImplementedError, match="times is not supported with adjust=False." 
+ NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + alpha=0.5, + halflife="1D", + ) + + +def test_ewma_times_adjust_false_with_disallowed_span(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), ): Series(range(1)).ewm( - 0.1, adjust=False, times=date_range("2000", freq="D", periods=1) + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + span=10, + halflife="1D", ) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 120470b09a92b..4d37c6d57f788 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -582,7 +582,7 @@ def test_groupby_rolling_string_index(self): groups = df.groupby("group") df["count_to_date"] = groups.cumcount() - rolling_groups = groups.rolling("10d", on="eventTime") + rolling_groups = groups.rolling("10D", on="eventTime") result = rolling_groups.apply(lambda df: df.shape[0]) expected = DataFrame( [ @@ -623,11 +623,14 @@ def test_groupby_rolling_count_closed_on(self, unit): "date": date_range(end="20190101", periods=6, unit=unit), } ) - result = ( - df.groupby("group") - .rolling("3d", on="date", closed="left")["column1"] - .count() - ) + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.groupby("group") + .rolling("3d", on="date", closed="left")["column1"] + .count() + ) dti = DatetimeIndex( [ "2018-12-27", diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 3695ab8bf6cd3..23b17c651f08d 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -319,7 +319,9 @@ def f(x): @td.skip_if_no("numba") def test_invalid_kwargs_nopython(): - with pytest.raises(NumbaUtilError, match="numba does not support kwargs with"): + with pytest.raises( + NumbaUtilError, match="numba does not support keyword-only arguments" + ): Series(range(1)).rolling(1).apply( lambda x: x, kwargs={"a": 1}, engine="numba", raw=True ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index fc8d7f69b8180..af3194b5085c4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -578,7 +578,7 @@ def test_missing_minp_zero_variable(): [np.nan] * 4, index=DatetimeIndex(["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"]), ) - result = x.rolling(Timedelta("2d"), min_periods=0).sum() + result = x.rolling(Timedelta("2D"), min_periods=0).sum() expected = Series(0.0, index=x.index) tm.assert_series_equal(result, expected) @@ -1153,7 +1153,7 @@ def test_timeoffset_as_window_parameter_for_corr(unit): index=dti, ) - res = df.rolling(window="3d").corr() + res = df.rolling(window="3D").corr() tm.assert_frame_equal(exp, res) @@ -1380,17 +1380,20 @@ def test_invalid_method(): Series(range(1)).rolling(1, method="foo") -@pytest.mark.parametrize("window", [1, "1d"]) -def test_rolling_descending_date_order_with_offset(window, frame_or_series): +def test_rolling_descending_date_order_with_offset(frame_or_series): # GH#40002 - idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") - obj = frame_or_series(range(1, 4), index=idx) - result = obj.rolling("1d", closed="left").sum() + msg = "'d' is 
deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") + obj = frame_or_series(range(1, 4), index=idx) + result = obj.rolling("1d", closed="left").sum() + expected = frame_or_series([np.nan, 1, 2], index=idx) tm.assert_equal(result, expected) - result = obj.iloc[::-1].rolling("1d", closed="left").sum() - idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d") + result = obj.iloc[::-1].rolling("1D", closed="left").sum() + idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1D") expected = frame_or_series([np.nan, 3, 2], index=idx) tm.assert_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 820b0134cc577..eacdaddfa28b0 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -101,7 +101,7 @@ def test_on(self, regular): # column is valid df = df.copy() df["C"] = date_range("20130101", periods=len(df)) - df.rolling(window="2d", on="C").sum() + df.rolling(window="2D", on="C").sum() # invalid columns msg = "window must be an integer" @@ -109,7 +109,7 @@ def test_on(self, regular): df.rolling(window="2d", on="B") # ok even though on non-selected - df.rolling(window="2d", on="C").B.sum() + df.rolling(window="2D", on="C").B.sum() def test_monotonic_on(self): # on/index must be monotonic @@ -682,7 +682,7 @@ def test_rolling_on_multi_index_level(self): [date_range("20190101", periods=3), range(2)], names=["date", "seq"] ), ) - result = df.rolling("10d", on=df.index.get_level_values("date")).sum() + result = df.rolling("10D", on=df.index.get_level_values("date")).sum() expected = DataFrame( {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index ) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 8e51183138b5c..bf4ec2e551f01 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -4,7 +4,7 @@ datetime, timedelta, ) -from typing import Callable +from typing import TYPE_CHECKING import warnings from dateutil.relativedelta import ( @@ -35,6 +35,9 @@ Easter, ) +if TYPE_CHECKING: + from collections.abc import Callable + def next_monday(dt: datetime) -> datetime: """ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index bdfb0b1cad8ae..165824bec131f 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -19,7 +18,10 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) def deprecate( diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index c4fec39594407..7e18ebe40cfa8 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -45,7 +45,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]: language_code, encoding = locale.getlocale() return { "commit": _get_commit_hash(), - "python": ".".join([str(i) for i in sys.version_info]), + "python": platform.python_version(), "python-bits": struct.calcsize("P") * 8, "OS": uname_result.system, "OS-release": uname_result.release, @@ -70,33 +70,25 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pytz", "dateutil", # install / build, - "setuptools", "pip", "Cython", - # test - "pytest", - "hypothesis", # docs "sphinx", - # 
Other, need a min version - "blosc", - "feather", - "xlsxwriter", - "lxml.etree", - "html5lib", - "pymysql", - "psycopg2", - "jinja2", # Other, not imported. "IPython", - "pandas_datareader", ] + # Optional dependencies deps.extend(list(VERSIONS)) result: dict[str, JSONSerializable] = {} for modname in deps: - mod = import_optional_dependency(modname, errors="ignore") - result[modname] = get_version(mod) if mod else None + try: + mod = import_optional_dependency(modname, errors="ignore") + except Exception: + # Dependency conflicts may cause a non ImportError + result[modname] = "N/A" + else: + result[modname] = get_version(mod) if mod else None return result diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 48684c4810d2a..1c17587db72d4 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -27,14 +27,12 @@ def test_foo(): from __future__ import annotations import locale -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import pytest if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import F from pandas.compat import ( diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index 153424e339c45..9838e371f0d00 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -8,11 +8,13 @@ from __future__ import annotations import collections -from collections.abc import Iterator +from collections.abc import ( + Callable, + Iterator, +) import itertools import re from typing import ( - Callable, SupportsInt, Tuple, Union, diff --git a/pyproject.toml b/pyproject.toml index d8a59ca2241a6..47fd540d67ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ authors = [ { name = 'The Pandas Development Team', email='pandas-dev@python.org' }, ] license = {file = 'LICENSE'} -requires-python = '>=3.9' +requires-python = '>=3.10' dependencies = [ "numpy>=1.23.5; python_version<'3.12'", "numpy>=1.26.0; python_version>='3.12'", @@ -43,7 +43,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', @@ -146,7 +145,7 @@ parentdir_prefix = "pandas-" setup = ['--vsenv'] # For Windows [tool.cibuildwheel] -skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" +skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" @@ -329,6 +328,8 @@ ignore = [ "RUF007", # mutable-class-default "RUF012", + # type-comparison + "E721", # Additional pylint rules # literal-membership @@ -531,7 +532,6 @@ module = [ "pandas._libs.*", "pandas._testing.*", # TODO "pandas.compat.numpy.function", # TODO - "pandas.compat.compressors", # TODO "pandas.core._numba.executor", # TODO "pandas.core.array_algos.masked_reductions", # TODO "pandas.core.array_algos.putmask", # TODO diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py index 52eca6f6d93ac..ec0a4a408c800 100644 --- a/scripts/check_for_inconsistent_pandas_namespace.py +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -27,10 +27,7 @@ Sequence, ) import sys -from typing import ( - NamedTuple, - Optional, -) +from typing import 
NamedTuple ERROR_MESSAGE = ( "{path}:{lineno}:{col_offset}: " @@ -89,7 +86,7 @@ def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str def check_for_inconsistent_pandas_namespace( content: str, path: str, *, replace: bool -) -> Optional[str]: +) -> str | None: tree = ast.parse(content) visitor = Visitor() @@ -121,7 +118,7 @@ def check_for_inconsistent_pandas_namespace( return replace_inconsistent_pandas_namespace(visitor, content) -def main(argv: Optional[Sequence[str]] = None) -> None: +def main(argv: Sequence[str] | None = None) -> None: parser = argparse.ArgumentParser() parser.add_argument("paths", nargs="*") parser.add_argument("--replace", action="store_true") diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index ed7b9affe9a50..b832b6aa95198 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -39,8 +39,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering' diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index ba3123a07df4b..35f6ffb4980df 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -12,14 +12,14 @@ import argparse import ast -from collections.abc import Iterable +from collections.abc import ( + Callable, + Iterable, +) import sys import token import tokenize -from typing import ( - IO, - Callable, -) +from typing import IO PRIVATE_IMPORTS_TO_IGNORE: set[str] = { "_extension_array_shared_docs", diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md index bb15b8f997b11..7f5f0326eba6c 100644 --- a/web/pandas/pdeps/0001-purpose-and-guidelines.md +++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md @@ -285,3 +285,4 @@ hope can help clarify our meaning here: [51417]: https://github.com/pandas-dev/pandas/pull/51417 [28900]: https://github.com/pandas-dev/pandas/issues/28900 [35407]: https://github.com/pandas-dev/pandas/issues/35407 +[53576]: https://github.com/pandas-dev/pandas/pull/53576 diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/pdeps/0006-ban-upcasting.md index a86455b70c71a..ae5872186bf23 100644 --- a/web/pandas/pdeps/0006-ban-upcasting.md +++ b/web/pandas/pdeps/0006-ban-upcasting.md @@ -1,7 +1,7 @@ # PDEP-6: Ban upcasting in setitem-like operations - Created: 23 December 2022 -- Status: Accepted +- Status: Implemented - Discussion: [#39584](https://github.com/pandas-dev/pandas/pull/50402) - Author: [Marco Gorelli](https://github.com/MarcoGorelli) ([original issue](https://github.com/pandas-dev/pandas/issues/39584) by [Joris Van den Bossche](https://github.com/jorisvandenbossche)) - Revision: 1 @@ -244,3 +244,4 @@ Deprecate sometime in the 2.x releases (after 2.0.0 has already been released), ### PDEP History - 23 December 2022: Initial draft +- 4 July 2024: Change status to "implemented"
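As context for the recurring `pytest.raises(TypeError, match="Invalid value")` pattern in the test changes above: with PDEP-6 now marked implemented, setting an item of incompatible dtype no longer upcasts with a FutureWarning but raises outright. A minimal sketch of the enforced behavior, assuming a pandas build that includes this patch (the "Invalid value" message text is taken from the updated tests, not from a stable public contract):

    import numpy as np
    import pandas as pd
    import pytest

    ser = pd.Series([1, 2, 3], dtype=np.int64)

    # An incompatible value no longer upcasts the Series with a
    # FutureWarning ("Setting an item of incompatible dtype");
    # under the enforced PDEP-6 behavior it raises instead.
    with pytest.raises(TypeError, match="Invalid value"):
        ser.iloc[0] = 2.5  # 2.5 cannot be held losslessly by int64

    # Lossless values are still accepted and the dtype is preserved.
    ser.iloc[0] = 7
    assert ser.dtype == np.int64

    # The supported migration is an explicit cast before assignment,
    # mirroring the `s = s.astype("float64")` fix-up in test_where.py above.
    ser = ser.astype("float64")
    ser.iloc[0] = 2.5
    assert ser.dtype == np.float64

The same rule explains the boolean `raises` fixtures that replace the old `warn` fixtures throughout test_setitem.py: each parametrized case now records whether the assignment is lossless (accepted, dtype preserved) or lossy (TypeError), rather than which warning the old upcasting path emitted.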