gigl.src.common.utils.BqUtils#

class gigl.src.common.utils.bq.BqUtils(project: str | None = None)#

Bases: object

Methods

__init__

check_columns_exist_in_table

count_number_of_columns_in_bq_table

count_number_of_rows_in_bq_table

create_bq_dataset

create_or_empty_bq_table

delete_bq_table_if_exist

Deletes a BigQuery table if it exists.

delete_matching_tables

does_bq_table_exist

export_to_gcs

Export a BigQuery table to Google Cloud Storage.

fetch_bq_table_schema

Create a dictionary representation of SchemaFields from a BigQuery table.

format_bq_path

Formats BQ paths.

get_dataset_name_from_table

get_table_names_within_date_range

Returns the names of tables in the dataset that match the regex table_match_string and fall within the given date range.

join_path

list_matching_tables

load_file_to_bq

Uploads a single file to BigQuery.

load_rows_to_bq

parse_bq_table_path

Parses a joined BQ table path into its project, dataset, and table names.

run_query

update_bq_dataset_retention

Update default retention for a whole BQ dataset.

update_bq_table_retention

Update retention of a single BQ table.

__init__(project: str | None = None) → None#
__weakref__#

list of weak references to the object (if defined)

delete_bq_table_if_exist(bq_table_path: str, not_found_ok: bool = True) → None#

Deletes a BigQuery table if it exists, e.g. bq_table_path = 'your-project.your_dataset.your_table'. With not_found_ok left at its default of True, a missing table is not treated as an error.
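
A minimal usage sketch, with placeholder project, dataset, and table names:

    from gigl.src.common.utils.bq import BqUtils

    bq_utils = BqUtils(project="your-project")  # placeholder project ID
    # No error is raised if the table is absent (not_found_ok defaults to True).
    bq_utils.delete_bq_table_if_exist(
        bq_table_path="your-project.your_dataset.your_table"
    )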

export_to_gcs(bq_table_path: str, destination_gcs_uri: GcsUri, destination_format: str = 'NEWLINE_DELIMITED_JSON') → None#

Export a BigQuery table to Google Cloud Storage.

Args:

  bq_table_path (str): The full BigQuery table path to export.

  destination_gcs_uri (GcsUri): The destination GCS URI where the table will be exported. If the GCS URI contains a *, the table will be exported to multiple shards.

  destination_format (str, optional): The format of the exported data. Defaults to 'NEWLINE_DELIMITED_JSON'. 'CSV', 'AVRO', and 'PARQUET' are also available.
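
A sketch of a sharded JSON export, reusing the bq_utils instance from the first sketch; the GcsUri import path is an assumption:

    from gigl.common import GcsUri  # import path is an assumption

    # The '*' in the URI shards the export across multiple files.
    bq_utils.export_to_gcs(
        bq_table_path="your-project.your_dataset.your_table",
        destination_gcs_uri=GcsUri("gs://your-bucket/exports/part-*.json"),
        destination_format="NEWLINE_DELIMITED_JSON",
    )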

fetch_bq_table_schema(bq_table: str) → Dict[str, SchemaField]#

Create a dictionary representation of SchemaFields from a BigQuery table.
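
A short sketch of inspecting the returned schema, assuming the dictionary is keyed by column name (an assumption) and reusing the bq_utils instance above:

    schema = bq_utils.fetch_bq_table_schema(
        bq_table="your-project.your_dataset.your_table"  # placeholder
    )
    for column_name, field in schema.items():
        # SchemaField exposes field_type and mode in google-cloud-bigquery.
        print(column_name, field.field_type, field.mode)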

static format_bq_path(bq_path: str, format_for_table_reference: bool = False) → str#

Formats BQ paths.

Args:

  bq_path (str): Expected to be one of:
    "<project>.<dataset>.<table>" or "<project>:<dataset>.<table>"
    "<project>.<dataset>" or "<project>:<dataset>"
    "<dataset>.<table>"

  format_for_table_reference (bool, optional): If project, dataset, and table are all specified, add the : separator between project and dataset. Useful when a "table reference" is required instead of a path, i.e. when using the BigQuery IO operator for Beam pipelines. Defaults to False.

Returns:

  str: Formatted BQ path
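
A short sketch; the exact normalization of the default form is inferred from the description above, so treat the outputs as illustrative:

    # Dotted path form, suitable for SQL and client APIs.
    path = BqUtils.format_bq_path("your-project:your_dataset.your_table")

    # Table-reference form with ':' between project and dataset,
    # e.g. for Beam's BigQuery IO.
    table_ref = BqUtils.format_bq_path(
        "your-project.your_dataset.your_table",
        format_for_table_reference=True,
    )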

get_table_names_within_date_range(bq_dataset_path: str, table_match_string: str, start_date: str, end_date: str) → List[str]#

Returns the names of tables in the dataset that match the regex table_match_string and fall within the given date range. start_date and end_date are in the format 'YYYYMMDD'.
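
A sketch with placeholder values, reusing the bq_utils instance above; the table-name pattern is hypothetical:

    tables = bq_utils.get_table_names_within_date_range(
        bq_dataset_path="your-project.your_dataset",
        table_match_string=r"daily_events_\d{8}",  # hypothetical regex
        start_date="20240101",
        end_date="20240131",
    )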

load_file_to_bq(source_path: Uri, bq_path: str, job_config: LoadJobConfig, retry: bool = False) → _AsyncJob#

Uploads a single file to BigQuery.

Args:

  source_path (Uri): The source file to upload.

  bq_path (str): The BigQuery table path to upload to.

  job_config (bigquery.LoadJobConfig): The job configuration for the upload.

  retry (bool, optional): Whether to retry the upload if it fails. Defaults to False.

Returns: The job object for the upload.
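
A hedged sketch of a JSON load, reusing the bq_utils instance above; the GcsUri import path and the job-config choices are assumptions, not requirements of this API:

    from google.cloud import bigquery
    from gigl.common import GcsUri  # import path is an assumption

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,  # infer the schema from the file
    )
    load_job = bq_utils.load_file_to_bq(
        source_path=GcsUri("gs://your-bucket/data/rows.json"),  # placeholder
        bq_path="your-project.your_dataset.your_table",
        job_config=job_config,
        retry=False,
    )
    load_job.result()  # block until the load job finishes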

static parse_bq_table_path(bq_table_path: str) → Tuple[str, str, str]#

Parses a joined BQ table path into its project, dataset, and table names.

Args:

  bq_table_path (str): Joined BQ table path of format project.dataset.table

Returns:

  bq_project_id (str): Parsed BQ project ID

  bq_dataset_id (str): Parsed dataset ID

  bq_table_name (str): Parsed table name
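
For example, with a placeholder path:

    project_id, dataset_id, table_name = BqUtils.parse_bq_table_path(
        "your-project.your_dataset.your_table"
    )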

update_bq_dataset_retention(bq_dataset_path: str, retention_in_days: int, apply_retroactively: bool | None = False) → None#

Update default retention for a whole BQ dataset. This applies only to new tables unless apply_retroactively=True.

Parameters:
  • bq_dataset_path – The BigQuery dataset path in the format project_id.dataset_id.

  • retention_in_days – The number of days to retain data in BigQuery tables.

  • apply_retroactively – If True, applies this retention policy retroactively to all existing tables in the dataset.
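
A minimal sketch with placeholder values, reusing the bq_utils instance above:

    # 90-day default expiration for new tables; apply_retroactively=True also
    # applies the policy to every existing table in the dataset.
    bq_utils.update_bq_dataset_retention(
        bq_dataset_path="your-project.your_dataset",
        retention_in_days=90,
        apply_retroactively=True,
    )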

update_bq_table_retention(bq_table_path: str, retention_in_days: int) → None#

Update retention of a single BQ table.

Parameters:
  • bq_table_path – The BigQuery table path in the format project_id.dataset_id.table_id.

  • retention_in_days – The number of days to retain data in the table.
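
A minimal sketch with placeholder values, reusing the bq_utils instance above:

    # Retain data in this one table for 30 days.
    bq_utils.update_bq_table_retention(
        bq_table_path="your-project.your_dataset.your_table",
        retention_in_days=30,
    )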