python-bigquery-dataframes/tests/system/large/test_dataframe.py at 70c7bd08fb093ae101c4121143f4702918bd38ee · googleapis/python-bigquery-dataframes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import sys

import pandas as pd
import pytest


@pytest.mark.skipif(
    sys.version_info >= (3, 12),
    # See: https://github.com/python/cpython/issues/112282
    reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
)
def test_corr_150_columns(scalars_df_numeric_150_columns_maybe_ordered):
    """Check that ``corr(numeric_only=True)`` agrees with pandas on the wide fixture.

    The fixture unpacks into a pair of frames: the first one exposes
    ``.corr(...).to_pandas()`` and the second is used as the pandas reference.
    Skipped on Python 3.12+ for the reason given in the ``skipif`` marker.
    """
    frame_under_test, reference_frame = scalars_df_numeric_150_columns_maybe_ordered

    actual = frame_under_test.corr(numeric_only=True).to_pandas()
    expected = reference_frame.corr(numeric_only=True)

    # Only the values and labels must match; dtype, index-type and
    # column-type differences between the two results are tolerated.
    relaxed_checks = {
        "check_dtype": False,
        "check_index_type": False,
        "check_column_type": False,
    }
    pd.testing.assert_frame_equal(actual, expected, **relaxed_checks)


@pytest.mark.skipif(
    sys.version_info >= (3, 12),
    # See: https://github.com/python/cpython/issues/112282
    reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
)
def test_cov_150_columns(scalars_df_numeric_150_columns_maybe_ordered):
    """Check that ``cov(numeric_only=True)`` agrees with pandas on the wide fixture.

    The fixture unpacks into a pair of frames: the first one exposes
    ``.cov(...).to_pandas()`` and the second is used as the pandas reference.
    Skipped on Python 3.12+ for the reason given in the ``skipif`` marker.
    """
    frame_under_test, reference_frame = scalars_df_numeric_150_columns_maybe_ordered

    actual = frame_under_test.cov(numeric_only=True).to_pandas()
    expected = reference_frame.cov(numeric_only=True)

    # Only the values and labels must match; dtype, index-type and
    # column-type differences between the two results are tolerated.
    relaxed_checks = {
        "check_dtype": False,
        "check_index_type": False,
        "check_column_type": False,
    }
    pd.testing.assert_frame_equal(actual, expected, **relaxed_checks)


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_unordered(
    scalars_df_unordered, scalars_pandas_df_default_index, keep
):
    """Check ``drop_duplicates`` on ``bool_col`` against pandas for each ``keep`` mode.

    Only sizes are compared: the number of surviving rows and the number of
    ``bool_col`` groups among them. The row contents themselves are not
    compared (presumably because the unordered frame does not guarantee which
    duplicate is retained -- confirm against the fixture definition).
    """
    dedup_kwargs = {"subset": "bool_col", "keep": keep}
    deduped_under_test = scalars_df_unordered.drop_duplicates(**dedup_kwargs)
    deduped_reference = scalars_pandas_df_default_index.drop_duplicates(
        **dedup_kwargs
    )

    # Same number of rows survive de-duplication.
    assert len(deduped_under_test) == len(deduped_reference)

    # Same number of distinct ``bool_col`` groups remain.
    groups_under_test = len(deduped_under_test.groupby("bool_col"))
    groups_reference = len(deduped_reference.groupby("bool_col"))
    assert groups_under_test == groups_reference