Merge remote-tracking branch 'upstream/master' into feature/add-flake8-code-check

tswast · tswast · commit 26dffb496615 · 2020-11-18T13:34:33.000-06:00
diff --git a/README.rst b/README.rst
@@ -54,11 +54,14 @@ To specify location of your datasets pass ``location`` to ``create_engine()``:
 Table names
 ___________
 
-To query tables from non-default projects, use the following format for the table name: ``project.dataset.table``, e.g.:
+To query tables from non-default projects or datasets, use the following format for the SQLAlchemy schema name: ``[project.]dataset``, e.g.:
 
 .. code-block:: python
 
-    sample_table = Table('bigquery-public-data.samples.natality')
+    # If neither dataset nor project is the default
+    sample_table_1 = Table('natality', schema='bigquery-public-data.samples')
+    # If only the dataset is not the default (schema is the dataset name)
+    sample_table_2 = Table('natality', schema='samples')
 
 Batch size
 __________
@@ -85,7 +88,7 @@ When using a default dataset, don't include the dataset name in the table name,
 
     table = Table('table_name')
 
-Note that specyfing a default dataset doesn't restrict execution of queries to that particular dataset when using raw queries, e.g.:
+Note that specifying a default dataset doesn't restrict execution of queries to that particular dataset when using raw queries, e.g.:
 
 .. code-block:: python
 
diff --git a/pybigquery/sqlalchemy_bigquery.py b/pybigquery/sqlalchemy_bigquery.py
@@ -3,6 +3,8 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
+import operator
+
 from google import auth
 from google.cloud import bigquery
 from google.cloud.bigquery import dbapi
@@ -184,12 +186,19 @@ def visit_column(self, column, add_to_result_map=None,
                 self.preparer.quote(tablename) + \
                 "." + name
 
-    def visit_label(self, *args, **kwargs):
-        # Use labels in GROUP BY clause
-        if len(kwargs) == 0 or len(kwargs) == 1:
+    def visit_label(self, *args, within_group_by=False, **kwargs):
+        # Use labels in GROUP BY clause.
+        #
+        # Flag set in the group_by_clause method. Works around missing
+        # equivalent to supports_simple_order_by_label for group by.
+        if within_group_by:
             kwargs['render_label_as_label'] = args[0]
-        result = super(BigQueryCompiler, self).visit_label(*args, **kwargs)
-        return result
+        return super(BigQueryCompiler, self).visit_label(*args, **kwargs)
+
+    def group_by_clause(self, select, **kw):
+        return super(BigQueryCompiler, self).group_by_clause(
+            select, **kw, within_group_by=True
+        )
 
 
 class BigQueryTypeCompiler(GenericTypeCompiler):
@@ -206,6 +215,9 @@ def visit_text(self, type_, **kw):
     def visit_string(self, type_, **kw):
         return 'STRING'
 
+    def visit_ARRAY(self, type_, **kw):
+        return "ARRAY<{}>".format(self.process(type_.item_type, **kw))
+
     def visit_BINARY(self, type_, **kw):
         return 'BYTES'
 
@@ -284,6 +296,11 @@ def __init__(
     def dbapi(cls):
         return dbapi
 
+    @staticmethod
+    def _build_formatted_table_id(table):
+        """Build '<dataset_id>.<table_id>' string using given table."""
+        return "{}.{}".format(table.reference.dataset_id, table.table_id)
+
     @staticmethod
     def _add_default_dataset_to_job_config(job_config, project_id, dataset_id):
         # If dataset_id is set, then we know the job_config isn't None
@@ -349,6 +366,26 @@ def _json_deserializer(self, row):
         """
         return row
 
+    def _get_table_or_view_names(self, connection, table_type, schema=None):
+        current_schema = schema or self.dataset_id
+        get_table_name = self._build_formatted_table_id \
+            if self.dataset_id is None else \
+            operator.attrgetter("table_id")
+
+        client = connection.connection._client
+        datasets = client.list_datasets()
+
+        result = []
+        for dataset in datasets:
+            if current_schema is not None and current_schema != dataset.dataset_id:
+                continue
+
+            tables = client.list_tables(dataset.reference)
+            for table in tables:
+                if table_type == table.table_type:
+                    result.append(get_table_name(table))
+        return result
+
     @staticmethod
     def _split_table_name(full_table_name):
         # Split full_table_name to get project, dataset and table name
@@ -363,22 +400,51 @@ def _split_table_name(full_table_name):
             dataset, table_name = table_name_split
         elif len(table_name_split) == 3:
             project, dataset, table_name = table_name_split
+        else:
+            raise ValueError("Did not understand table_name: {}".format(full_table_name))
 
         return (project, dataset, table_name)
 
+    def _table_reference(self, provided_schema_name, provided_table_name,
+                         client_project):
+        project_id_from_table, dataset_id_from_table, table_id = self._split_table_name(provided_table_name)
+        project_id_from_schema = None
+        dataset_id_from_schema = None
+        if provided_schema_name is not None:
+            provided_schema_name_split = provided_schema_name.split('.')
+            if len(provided_schema_name_split) == 0:
+                pass
+            elif len(provided_schema_name_split) == 1:
+                if dataset_id_from_table:
+                    project_id_from_schema = provided_schema_name_split[0]
+                else:
+                    dataset_id_from_schema = provided_schema_name_split[0]
+            elif len(provided_schema_name_split) == 2:
+                project_id_from_schema = provided_schema_name_split[0]
+                dataset_id_from_schema = provided_schema_name_split[1]
+            else:
+                raise ValueError("Did not understand schema: {}".format(provided_schema_name))
+        if (dataset_id_from_schema and dataset_id_from_table and
+           dataset_id_from_schema != dataset_id_from_table):
+            raise ValueError("dataset_id specified in schema and table_name disagree: got {} in schema, and {} in table_name".format(dataset_id_from_schema, dataset_id_from_table))
+        if (project_id_from_schema and project_id_from_table and
+           project_id_from_schema != project_id_from_table):
+            raise ValueError("project_id specified in schema and table_name disagree: got {} in schema, and {} in table_name".format(project_id_from_schema, project_id_from_table))
+        project_id = project_id_from_schema or project_id_from_table or client_project
+        dataset_id = dataset_id_from_schema or dataset_id_from_table or self.dataset_id
+
+        table_ref = TableReference.from_string("{}.{}.{}".format(
+            project_id, dataset_id, table_id
+        ))
+        return table_ref
+
     def _get_table(self, connection, table_name, schema=None):
         if isinstance(connection, Engine):
             connection = connection.connect()
 
         client = connection.connection._client
 
-        project_id, dataset_id, table_id = self._split_table_name(table_name)
-        project_id = project_id or client.project
-        dataset_id = dataset_id or schema or self.dataset_id
-
-        table_ref = TableReference.from_string("{}.{}.{}".format(
-            project_id, dataset_id, table_id
-        ))
+        table_ref = self._table_reference(schema, table_name, client.project)
         try:
             table = client.get_table(table_ref)
         except NotFound:
@@ -464,23 +530,13 @@ def get_table_names(self, connection, schema=None, **kw):
         if isinstance(connection, Engine):
             connection = connection.connect()
 
-        datasets = connection.connection._client.list_datasets()
-        result = []
-        for d in datasets:
-            if schema is not None and d.dataset_id != schema:
-                continue
+        return self._get_table_or_view_names(connection, "TABLE", schema)
 
-            if self.dataset_id is not None and d.dataset_id != self.dataset_id:
-                continue
+    def get_view_names(self, connection, schema=None, **kw):
+        if isinstance(connection, Engine):
+            connection = connection.connect()
 
-            tables = connection.connection._client.list_tables(d.reference)
-            for t in tables:
-                if self.dataset_id is None:
-                    table_name = d.dataset_id + '.' + t.table_id
-                else:
-                    table_name = t.table_id
-                result.append(table_name)
-        return result
+        return self._get_table_or_view_names(connection, "VIEW", schema)
 
     def do_rollback(self, dbapi_connection):
         # BigQuery has no support for transactions.
diff --git a/scripts/load_test_data.sh b/scripts/load_test_data.sh
@@ -6,6 +6,7 @@ bq rm -f -t test_pybigquery.sample
 bq rm -f -t test_pybigquery_alt.sample_alt
 bq rm -f -t test_pybigquery.sample_one_row
 bq rm -f -t test_pybigquery.sample_dml
+bq rm -f -t test_pybigquery.sample_view
 bq rm -f -t test_pybigquery_location.sample_one_row
 
 bq mk --table --schema=$(dirname $0)/schema.json --time_partitioning_field timestamp --clustering_fields integer,string test_pybigquery.sample
@@ -17,3 +18,5 @@ bq load --source_format=NEWLINE_DELIMITED_JSON --schema=$(dirname $0)/schema.jso
 
 bq --location=asia-northeast1 load --source_format=NEWLINE_DELIMITED_JSON --schema=$(dirname $0)/schema.json test_pybigquery_location.sample_one_row $(dirname $0)/sample_one_row.json
 bq mk --schema=$(dirname $0)/schema.json -t test_pybigquery.sample_dml
+
+bq mk --use_legacy_sql=false --view 'SELECT string FROM test_pybigquery.sample' test_pybigquery.sample_view
diff --git a/test/test_sqlalchemy_bigquery.py b/test/test_sqlalchemy_bigquery.py
@@ -3,6 +3,7 @@
 
 from google.api_core.exceptions import BadRequest
 from pybigquery.api import ApiClient
+from pybigquery.sqlalchemy_bigquery import BigQueryDialect
 from sqlalchemy.engine import create_engine
 from sqlalchemy.schema import Table, MetaData, Column
 from sqlalchemy.ext.declarative import declarative_base
@@ -102,6 +103,11 @@ def engine():
     return engine
 
 
+@pytest.fixture(scope='session')
+def dialect():
+    return BigQueryDialect()
+
+
 @pytest.fixture(scope='session')
 def engine_using_test_dataset():
     engine = create_engine('bigquery:///test_pybigquery', echo=True)
@@ -163,10 +169,14 @@ def query():
     def query(table):
         col1 = literal_column("TIMESTAMP_TRUNC(timestamp, DAY)").label("timestamp_label")
         col2 = func.sum(table.c.integer)
+        # Test rendering of nested labels. Full expression should render in SELECT, but
+        # ORDER/GROUP BY should use label only.
+        col3 = func.sum(func.sum(table.c.integer.label("inner")).label("outer")).over().label('outer')
         query = (
             select([
                 col1,
                 col2,
+                col3,
             ])
             .where(col1 < '2017-01-01 00:00:00')
             .group_by(col1)
@@ -284,11 +294,13 @@ def test_tables_list(engine, engine_using_test_dataset):
     assert 'test_pybigquery.sample' in tables
     assert 'test_pybigquery.sample_one_row' in tables
     assert 'test_pybigquery.sample_dml' in tables
+    assert 'test_pybigquery.sample_view' not in tables
 
     tables = engine_using_test_dataset.table_names()
     assert 'sample' in tables
     assert 'sample_one_row' in tables
     assert 'sample_dml' in tables
+    assert 'sample_view' not in tables
 
 
 def test_group_by(session, table, session_using_test_dataset, table_using_test_dataset):
@@ -298,6 +310,33 @@ def test_group_by(session, table, session_using_test_dataset, table_using_test_d
     assert len(result) > 0
 
 
+def test_nested_labels(engine, table):
+    col = table.c.integer
+    exprs = [
+        sqlalchemy.func.sum(
+            sqlalchemy.func.sum(col.label("inner")
+        ).label("outer")).over(),
+        sqlalchemy.func.sum(
+            sqlalchemy.case([[
+                sqlalchemy.literal(True),
+                col.label("inner"),
+            ]]).label("outer")
+        ),
+        sqlalchemy.func.sum(
+            sqlalchemy.func.sum(
+                sqlalchemy.case([[
+                    sqlalchemy.literal(True), col.label("inner")
+                ]]).label("middle")
+            ).label("outer")
+        ).over(),
+    ]
+    for expr in exprs:
+        sql = str(expr.compile(engine))
+        assert "inner" not in sql
+        assert "middle" not in sql
+        assert "outer" not in sql
+
+
 def test_session_query(session, table, session_using_test_dataset, table_using_test_dataset):
     for session, table in [(session, table), (session_using_test_dataset, table_using_test_dataset)]:
         col_concat = func.concat(table.c.string).label('concat')
@@ -359,6 +398,16 @@ def test_compiled_query_literal_binds(engine, engine_using_test_dataset, table,
     assert len(result) > 0
 
 
+@pytest.mark.parametrize(["column", "processed"], [
+    (types.String(), "STRING"),
+    (types.NUMERIC(), "NUMERIC"),
+    (types.ARRAY(types.String), "ARRAY<STRING>"),
+])
+def test_compile_types(engine, column, processed):
+    result = engine.dialect.type_compiler.process(column)
+    assert result == processed
+
+
 def test_joins(session, table, table_one_row):
     result = (session.query(table.c.string, func.count(table_one_row.c.integer))
                      .join(table_one_row, table_one_row.c.string == table.c.string)
@@ -438,15 +487,27 @@ def test_table_names_in_schema(inspector, inspector_using_test_dataset):
     assert 'test_pybigquery.sample' in tables
     assert 'test_pybigquery.sample_one_row' in tables
     assert 'test_pybigquery.sample_dml' in tables
+    assert 'test_pybigquery.sample_view' not in tables
     assert len(tables) == 3
 
     tables = inspector_using_test_dataset.get_table_names()
     assert 'sample' in tables
     assert 'sample_one_row' in tables
     assert 'sample_dml' in tables
+    assert 'sample_view' not in tables
     assert len(tables) == 3
 
 
+def test_view_names(inspector, inspector_using_test_dataset):
+    view_names = inspector.get_view_names()
+    assert "test_pybigquery.sample_view" in view_names
+    assert "test_pybigquery.sample" not in view_names
+
+    view_names = inspector_using_test_dataset.get_view_names()
+    assert "sample_view" in view_names
+    assert "sample" not in view_names
+
+
 def test_get_indexes(inspector, inspector_using_test_dataset):
     for _ in ['test_pybigquery.sample', 'test_pybigquery.sample_one_row']:
         indexes = inspector.get_indexes('test_pybigquery.sample')
@@ -479,6 +540,38 @@ def test_get_columns(inspector, inspector_using_test_dataset):
             assert col['type'].__class__.__name__ == sample_col['type'].__class__.__name__
 
 
+@pytest.mark.parametrize('provided_schema_name,provided_table_name,client_project',
+                         [
+                             ('dataset', 'table', 'project'),
+                             (None, 'dataset.table', 'project'),
+                             (None, 'project.dataset.table', 'other_project'),
+                             ('project', 'dataset.table', 'other_project'),
+                             ('project.dataset', 'table', 'other_project'),
+                         ])
+def test_table_reference(dialect, provided_schema_name,
+                         provided_table_name, client_project):
+    ref = dialect._table_reference(provided_schema_name,
+                                   provided_table_name,
+                                   client_project)
+    assert ref.table_id == 'table'
+    assert ref.dataset_id == 'dataset'
+    assert ref.project == 'project'
+
+@pytest.mark.parametrize('provided_schema_name,provided_table_name,client_project',
+                         [
+                             ('project.dataset', 'other_dataset.table', 'project'),
+                             ('project.dataset', 'other_project.dataset.table', 'project'),
+                             ('project.dataset.something_else', 'table', 'project'),
+                             (None, 'project.dataset.table.something_else', 'project'),
+                         ])
+def test_invalid_table_reference(dialect, provided_schema_name,
+                                 provided_table_name, client_project):
+    with pytest.raises(ValueError):
+        dialect._table_reference(provided_schema_name,
+                                 provided_table_name,
+                                 client_project)
+
+
 def test_has_table(engine, engine_using_test_dataset):
     assert engine.has_table('sample', 'test_pybigquery') is True
     assert engine.has_table('test_pybigquery.sample') is True