gigl.src.common.utils.BqUtils#

class gigl.src.common.utils.bq.BqUtils(project: str | None = None)#

Bases: object

Methods

__init__

check_columns_exist_in_table

count_number_of_columns_in_bq_table

count_number_of_rows_in_bq_table

create_bq_dataset

create_or_empty_bq_table

delete_bq_table_if_exist

Deletes a BigQuery table if it exists.

delete_matching_tables

does_bq_table_exist

export_to_gcs

Export a BigQuery table to Google Cloud Storage.

fetch_bq_table_schema

Create a dictionary representation of SchemaFields from a BigQuery table.

format_bq_path

Formats BQ paths.

get_dataset_name_from_table

get_table_names_within_date_range

Returns the names of tables in the dataset that match the regex table_match_string and fall within the given date range.

join_path

list_matching_tables

load_file_to_bq

Uploads a single file to BigQuery.

load_rows_to_bq

parse_bq_table_path

Parses a joined BQ table path into its project, dataset, and table names.

run_query

update_bq_dataset_retention

Update default retention for a whole BQ dataset.

update_bq_table_retention

Update retention of a single BQ table.

__init__(project: str | None = None) → None#
__weakref__#

list of weak references to the object (if defined)

delete_bq_table_if_exist(bq_table_path: str, not_found_ok: bool = True) → None#

Deletes a BigQuery table if it exists, e.g. bq_table_path = 'your-project.your_dataset.your_table'. With not_found_ok left at its default of True, a missing table is not treated as an error.
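
A minimal usage sketch, with placeholder project, dataset, and table names:

    from gigl.src.common.utils.bq import BqUtils

    bq_utils = BqUtils(project="your-project")  # placeholder project ID
    # No error is raised if the table is absent (not_found_ok defaults to True).
    bq_utils.delete_bq_table_if_exist(
        bq_table_path="your-project.your_dataset.your_table"
    )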

export_to_gcs(bq_table_path: str, destination_gcs_uri: GcsUri, destination_format: str = 'NEWLINE_DELIMITED_JSON') → None#

Export a BigQuery table to Google Cloud Storage.

Args:

  bq_table_path (str): The full BigQuery table path to export.

  destination_gcs_uri (GcsUri): The destination GCS URI where the table will be exported. If the GCS URI contains a *, the table will be exported to multiple shards.

  destination_format (str, optional): The format of the exported data. Defaults to 'NEWLINE_DELIMITED_JSON'. 'CSV', 'AVRO', and 'PARQUET' are also available.
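
A sketch of a sharded JSON export, reusing the bq_utils instance from the first sketch; the GcsUri import path is an assumption:

    from gigl.common import GcsUri  # import path is an assumption

    # The '*' in the URI shards the export across multiple files.
    bq_utils.export_to_gcs(
        bq_table_path="your-project.your_dataset.your_table",
        destination_gcs_uri=GcsUri("gs://your-bucket/exports/part-*.json"),
        destination_format="NEWLINE_DELIMITED_JSON",
    )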

fetch_bq_table_schema(bq_table: str) → Dict[str, SchemaField]#

Create a dictionary representation of SchemaFields from a BigQuery table.
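
A short sketch of inspecting the returned schema, assuming the dictionary is keyed by column name (an assumption) and reusing the bq_utils instance above:

    schema = bq_utils.fetch_bq_table_schema(
        bq_table="your-project.your_dataset.your_table"  # placeholder
    )
    for column_name, field in schema.items():
        # SchemaField exposes field_type and mode in google-cloud-bigquery.
        print(column_name, field.field_type, field.mode)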

static format_bq_path(bq_path: str, format_for_table_reference: bool = False) → str#

Formats BQ paths.

Args:

  bq_path (str): Expected to be one of:
    "<project>.<dataset>.<table>" or "<project>:<dataset>.<table>"
    "<project>.<dataset>" or "<project>:<dataset>"
    "<dataset>.<table>"

  format_for_table_reference (bool, optional): If project, dataset, and table are all specified, add the : separator between project and dataset. Useful when a "table reference" is required instead of a path, i.e. when using the BigQuery IO operator for Beam pipelines. Defaults to False.

Returns:

  str: Formatted BQ path
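
A short sketch; the exact normalization of the default form is inferred from the description above, so treat the outputs as illustrative:

    # Dotted path form, suitable for SQL and client APIs.
    path = BqUtils.format_bq_path("your-project:your_dataset.your_table")

    # Table-reference form with ':' between project and dataset,
    # e.g. for Beam's BigQuery IO.
    table_ref = BqUtils.format_bq_path(
        "your-project.your_dataset.your_table",
        format_for_table_reference=True,
    )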

get_table_names_within_date_range(bq_dataset_path: str, table_match_string: str, start_date: str, end_date: str) → List[str]#

Returns the names of tables in the dataset that match the regex table_match_string and fall within the given date range. start_date and end_date are in the format 'YYYYMMDD'.
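
A sketch with placeholder values, reusing the bq_utils instance above; the table-name pattern is hypothetical:

    tables = bq_utils.get_table_names_within_date_range(
        bq_dataset_path="your-project.your_dataset",
        table_match_string=r"daily_events_\d{8}",  # hypothetical regex
        start_date="20240101",
        end_date="20240131",
    )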

load_file_to_bq(source_path: Uri, bq_path: str, job_config: LoadJobConfig, retry: bool = False) → _AsyncJob#

Uploads a single file to BigQuery.

Args:

  source_path (Uri): The source file to upload.

  bq_path (str): The BigQuery table path to upload to.

  job_config (bigquery.LoadJobConfig): The job configuration for the upload.

  retry (bool, optional): Whether to retry the upload if it fails. Defaults to False.

Returns: The job object for the upload.
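
A hedged sketch of a JSON load, reusing the bq_utils instance above; the GcsUri import path and the job-config choices are assumptions, not requirements of this API:

    from google.cloud import bigquery
    from gigl.common import GcsUri  # import path is an assumption

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,  # infer the schema from the file
    )
    load_job = bq_utils.load_file_to_bq(
        source_path=GcsUri("gs://your-bucket/data/rows.json"),  # placeholder
        bq_path="your-project.your_dataset.your_table",
        job_config=job_config,
        retry=False,
    )
    load_job.result()  # block until the load job finishes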

static parse_bq_table_path(bq_table_path: str) → Tuple[str, str, str]#

Parses a joined BQ table path into its project, dataset, and table names.

Args:

  bq_table_path (str): Joined BQ table path of format project.dataset.table

Returns:

  bq_project_id (str): Parsed BQ project ID

  bq_dataset_id (str): Parsed dataset ID

  bq_table_name (str): Parsed table name
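
For example, with a placeholder path:

    project_id, dataset_id, table_name = BqUtils.parse_bq_table_path(
        "your-project.your_dataset.your_table"
    )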

update_bq_dataset_retention(bq_dataset_path: str, retention_in_days: int, apply_retroactively: bool | None = False) → None#

Update default retention for a whole BQ dataset. This applies only to new tables unless apply_retroactively=True.

Parameters:
  • bq_dataset_path – The BigQuery dataset path in the format project_id.dataset_id.

  • retention_in_days – The number of days to retain data in BigQuery tables.

  • apply_retroactively – If True, applies this retention policy retroactively to all existing tables in the dataset.
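
A minimal sketch with placeholder values, reusing the bq_utils instance above:

    # 90-day default expiration for new tables; apply_retroactively=True also
    # applies the policy to every existing table in the dataset.
    bq_utils.update_bq_dataset_retention(
        bq_dataset_path="your-project.your_dataset",
        retention_in_days=90,
        apply_retroactively=True,
    )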

update_bq_table_retention(bq_table_path: str, retention_in_days: int) → None#

Update retention of a single BQ table.

Parameters:
  • bq_table_path – The BigQuery table path in the format project_id.dataset_id.table_id.

  • retention_in_days – The number of days to retain data in the table.
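
A minimal sketch with placeholder values, reusing the bq_utils instance above:

    # Retain data in this one table for 30 days.
    bq_utils.update_bq_table_retention(
        bq_table_path="your-project.your_dataset.your_table",
        retention_in_days=30,
    )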