
Commit f106b7c

feat!: GCS to BigTable catalog from GCS bucket (#1026)
* feat!: GCS to BigTable catalog from GCS bucket
* fix: provide google cloud storage package dependency
* fix: test case
* fix: test case
* fix: test case
* fix: simplify test cases
1 parent 6af6cc4 commit f106b7c

File tree: 5 files changed, +88 -112 lines changed


python/.ci/Jenkinsfile

Lines changed: 71 additions & 18 deletions
@@ -751,7 +751,76 @@ pipeline {
         steps{
             retry(count: stageRetryCount) {
                 sh '''
-
+
+cat > /tmp/cities.json << EOF
+{
+  "table": {
+    "name": "cities"
+  },
+  "rowkey": "key",
+  "columns": {
+    "key": {
+      "cf": "rowkey",
+      "col": "key",
+      "type": "string"
+    },
+    "LatD": {
+      "cf": "lat",
+      "col": "LatD",
+      "type": "string"
+    },
+    "LatM": {
+      "cf": "lat",
+      "col": "LatM",
+      "type": "string"
+    },
+    "LatS": {
+      "cf": "lat",
+      "col": "LatS",
+      "type": "string"
+    },
+    "NS": {
+      "cf": "lat",
+      "col": "NS",
+      "type": "string"
+    },
+    "LonD": {
+      "cf": "lon",
+      "col": "LonD",
+      "type": "string"
+    },
+    "LonM": {
+      "cf": "lon",
+      "col": "LonM",
+      "type": "string"
+    },
+    "LonS": {
+      "cf": "lon",
+      "col": "LonS",
+      "type": "string"
+    },
+    "EW": {
+      "cf": "lon",
+      "col": "EW",
+      "type": "string"
+    },
+    "City": {
+      "cf": "place",
+      "col": "City",
+      "type": "string"
+    },
+    "State": {
+      "cf": "place",
+      "col": "State",
+      "type": "string"
+    }
+  }
+}
+EOF
+
+
+cat /tmp/cities.json
+gsutil cp /tmp/cities.json gs://dataproc-templates/conf/
 export GCS_STAGING_LOCATION="gs://python-dataproc-templates-temp"
 export SKIP_BUILD=true
 export JARS="gs://spark-lib/bigtable/spark-bigtable_2.12-0.1.0.jar"
@@ -765,23 +834,7 @@ pipeline {
 --gcs.bigtable.input.location="gs://dataproc-templates/data/csv/cities.csv" \
 --spark.bigtable.project.id=$GCP_PROJECT \
 --spark.bigtable.instance.id=$ENV_TEST_BIGTABLE_INSTANCE \
---gcs.bigtable.catalog.json=\'\'\'{
-    "table":{"name":"cities"},
-    "rowkey":"key",
-    "columns":{
-        "key":{"cf":"rowkey", "col":"key", "type":"string"},
-        "LatD":{"cf":"lat", "col":"LatD", "type":"string"},
-        "LatM":{"cf":"lat", "col":"LatM", "type":"string"},
-        "LatS":{"cf":"lat", "col":"LatS", "type":"string"},
-        "NS":{"cf":"lat", "col":"NS", "type":"string"},
-        "LonD":{"cf":"lon", "col":"LonD", "type":"string"},
-        "LonM":{"cf":"lon", "col":"LonM", "type":"string"},
-        "LonS":{"cf":"lon", "col":"LonS", "type":"string"},
-        "EW":{"cf":"lon", "col":"EW", "type":"string"},
-        "City":{"cf":"place", "col":"City", "type":"string"},
-        "State":{"cf":"place", "col":"State", "type":"string"}
-    }
-}\'\'\'
+--gcs.bigtable.catalog.json="gs://dataproc-templates/conf/cities.json"
 '''
             }
         }
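The CI step above writes the full cities catalog to /tmp/cities.json via a heredoc and stages it with gsutil, so the template now receives only a gs:// path. As a minimal sketch under stated assumptions, the same staging could be done with the google-cloud-storage client this commit adds to python/requirements.txt; the bucket and object path below are the ones from the CI example, and the catalog is trimmed to a few of the columns defined in the heredoc.

```python
# Hedged sketch of the staging step above, using google-cloud-storage instead of gsutil.
# Bucket and object names come from the CI example; the catalog is abbreviated.
import json
from google.cloud import storage

catalog = {
    "table": {"name": "cities"},
    "rowkey": "key",
    "columns": {
        "key": {"cf": "rowkey", "col": "key", "type": "string"},
        "City": {"cf": "place", "col": "City", "type": "string"},
        "State": {"cf": "place", "col": "State", "type": "string"},
    },
}

client = storage.Client()
blob = client.bucket("dataproc-templates").blob("conf/cities.json")
blob.upload_from_string(json.dumps(catalog, indent=2), content_type="application/json")
```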

python/dataproc_templates/gcs/README.md

Lines changed: 3 additions & 10 deletions
@@ -229,7 +229,7 @@ It also requires [DeltaIO dependencies](https://docs.delta.io/latest/releases.ht
 * `gcs.bigquery.input.format`: Input file format (one of: avro,parquet,csv,json,delta)
 * `spark.bigtable.project.id`: GCP project where BigTable instance is running
 * `spark.bigtable.instance.id`: BigTable instance id
-* `gcs.bigtable.catalog.json`: BigTable catalog inline json
+* `gcs.bigtable.catalog.json`: BigTable catalog json file GCS path
 #### Optional Arguments
 * `gcs.bigtable.input.chartoescapequoteescaping`: Sets a single character used for escaping the escape for the quote character. The default value is escape character when escape and quote characters are different, \0 otherwise
 * `gcs.bigtable.input.columnnameofcorruptrecord`: Allows renaming the new field having malformed string created by PERMISSIVE mode
@@ -373,7 +373,7 @@ options:
   --spark.bigtable.instance.id SPARK.BIGTABLE.INSTANCE.ID
                         BigTable instance id
   --gcs.bigtable.catalog.json GCS.BT.CATALOG.JSON
-                        BigTable catalog inline json
+                        BigTable catalog json file GCS path
 ```

 ## Example submission
@@ -392,14 +392,7 @@ export SPARK_PROPERTIES="spark.jars.packages=org.slf4j:slf4j-reload4j:1.7.36"
 --gcs.bigtable.input.header="false" \
 --spark.bigtable.project.id="<GCP_PROJECT>" \
 --spark.bigtable.instance.id="<BIGTABLE_INSTANCE_ID>" \
---gcs.bigtable.catalog.json='''{
-    "table":{"name":"my_table"},
-    "rowkey":"key",
-    "columns":{
-        "key":{"cf":"rowkey", "col":"key", "type":"string"},
-        "name":{"cf":"cf", "col":"name", "type":"string"}
-    }
-}'''
+--gcs.bigtable.catalog.json="<gs://bucket/path>"
 ```

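Because the example submission now passes a GCS path, the catalog document the README previously inlined must already exist at that path before submission. A minimal sketch, assuming the README's placeholder table and column names, of producing that file locally; copy it to the `<gs://bucket/path>` you pass via `--gcs.bigtable.catalog.json`.

```python
# Sketch only: write the minimal catalog from the old inline example to a local
# file, then copy it to the GCS path passed as --gcs.bigtable.catalog.json
# (for example with gsutil cp). The local file name is illustrative.
import json

catalog = {
    "table": {"name": "my_table"},
    "rowkey": "key",
    "columns": {
        "key": {"cf": "rowkey", "col": "key", "type": "string"},
        "name": {"cf": "cf", "col": "name", "type": "string"},
    },
}

with open("my_table_catalog.json", "w") as f:
    json.dump(catalog, f, indent=2)
```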

python/dataproc_templates/gcs/gcs_to_bigtable.py

Lines changed: 11 additions & 2 deletions
@@ -23,6 +23,7 @@
 from dataproc_templates.util.argument_parsing import add_spark_options
 from dataproc_templates.util.dataframe_reader_wrappers import ingest_dataframe_from_cloud_storage
 import dataproc_templates.util.template_constants as constants
+from google.cloud import storage


 __all__ = ['GCSToBigTableTemplate']
@@ -87,7 +88,7 @@ def parse_args(args: Optional[Sequence[str]] = None) -> Dict[str, Any]:
             f'--{constants.GCS_BT_CATALOG_JSON}',
             dest=constants.GCS_BT_CATALOG_JSON,
             required=True,
-            help='BigTable catalog inline json'
+            help='BigTable catalog json stored file GCS location'
         )

         known_args: argparse.Namespace
@@ -102,7 +103,6 @@ def run(self, spark: SparkSession, args: Dict[str, Any]) -> None:
         # Arguments
         input_location: str = args[constants.GCS_BT_INPUT_LOCATION]
         input_format: str = args[constants.GCS_BT_INPUT_FORMAT]
-        catalog: str = ''.join(args[constants.GCS_BT_CATALOG_JSON].split())
         project_id: str = args[constants.GCS_BT_PROJECT_ID]
         instance_id: str = args[constants.GCS_BT_INSTANCE_ID]
         create_new_table: bool = args[constants.GCS_BT_CREATE_NEW_TABLE]
@@ -114,6 +114,15 @@ def run(self, spark: SparkSession, args: Dict[str, Any]) -> None:
             f"{pprint.pformat(args)}"
         )

+        # Read Catalog From GCS
+        storage_client = storage.Client()
+        bucket = storage_client.bucket(args[constants.GCS_BT_CATALOG_JSON].split('/')[2])
+        blob = bucket.blob('/'.join(args[constants.GCS_BT_CATALOG_JSON].split('/')[3:]))
+        catalog = blob.download_as_text()
+
+        logger.info(f"Catalog: {catalog}")
+
+
         # Read
         input_data = ingest_dataframe_from_cloud_storage(
             spark, args, input_location, input_format, "gcs.bigtable.input."
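The new run() code splits the gs:// URI by hand: element 2 of the `/`-split is the bucket name and everything after it is the object name. A small standalone sketch of the same parsing and download follows; the helper name is illustrative and not part of the template.

```python
# Sketch of the catalog lookup the template now performs; the function name is hypothetical.
from google.cloud import storage


def read_catalog_from_gcs(catalog_uri: str) -> str:
    """Download the catalog json text from a gs://bucket/object URI."""
    parts = catalog_uri.split('/')        # ['gs:', '', '<bucket>', '<path>', ...]
    bucket_name = parts[2]                # e.g. 'dataproc-templates'
    blob_name = '/'.join(parts[3:])       # e.g. 'conf/cities.json'
    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    return blob.download_as_text()


# For gs://dataproc-templates/conf/cities.json this resolves to
# bucket 'dataproc-templates' and object 'conf/cities.json'.
```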

python/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ pyarrow==16.1.0
 pandas==2.0.3
 numpy==1.24.4
 pyspark==3.5.1
+google-cloud-storage==2.18.2

python/test/gcs/test_gcs_to_bigtable.py

Lines changed: 2 additions & 82 deletions
@@ -35,90 +35,10 @@ def test_parse_args(self):
              "--gcs.bigtable.input.location=gs://test",
              "--spark.bigtable.project.id=GCP_PROJECT",
              "--spark.bigtable.instance.id=BIGTABLE_INSTANCE_ID",
-             "--gcs.bigtable.catalog.json={key:value}"])
+             "--gcs.bigtable.catalog.json=gs://dataproc-templates/conf/employeecatalog.json"])

         assert parsed_args["gcs.bigtable.input.format"] == "parquet"
         assert parsed_args["gcs.bigtable.input.location"] == "gs://test"
         assert parsed_args["spark.bigtable.project.id"] == "GCP_PROJECT"
         assert parsed_args["spark.bigtable.instance.id"] == "BIGTABLE_INSTANCE_ID"
-        assert parsed_args["gcs.bigtable.catalog.json"] == '{key:value}'
-
-    @mock.patch.object(pyspark.sql, 'SparkSession')
-    def test_run(self, mock_spark_session):
-        """Tests GCSToBigTableTemplate runs"""
-
-        gcs_to_bigtable_template = GCSToBigTableTemplate()
-        mock_parsed_args = gcs_to_bigtable_template.parse_args(
-            ["--gcs.bigtable.input.format=parquet",
-             "--gcs.bigtable.input.location=gs://test",
-             "--spark.bigtable.project.id=GCP_PROJECT",
-             "--spark.bigtable.instance.id=BIGTABLE_INSTANCE_ID",
-             "--gcs.bigtable.catalog.json={key:value}"])
-        mock_spark_session.read.parquet.return_value = mock_spark_session.dataframe.DataFrame
-        gcs_to_bigtable_template.run(mock_spark_session, mock_parsed_args)
-
-        mock_spark_session.read.parquet.assert_called_once_with("gs://test")
-        mock_spark_session.dataframe.DataFrame.write.format. \
-            assert_called_once_with(constants.FORMAT_BIGTABLE)
-        mock_spark_session.dataframe.DataFrame.write.format().options. \
-            assert_called_with(catalog='{key:value}')
-
-    @mock.patch.object(pyspark.sql, 'SparkSession')
-    def test_run_csv1(self, mock_spark_session):
-        """Tests GCSToBigTableTemplate runs with csv format"""
-
-        gcs_to_bigtable_template = GCSToBigTableTemplate()
-        mock_parsed_args = gcs_to_bigtable_template.parse_args(
-            ["--gcs.bigtable.input.format=csv",
-             "--gcs.bigtable.input.location=gs://test",
-             "--gcs.bigtable.input.header=false",
-             "--spark.bigtable.project.id=GCP_PROJECT",
-             "--spark.bigtable.instance.id=BIGTABLE_INSTANCE_ID",
-             "--gcs.bigtable.catalog.json={key:value}"])
-        mock_spark_session.read.format().options().load.return_value = mock_spark_session.dataframe.DataFrame
-        gcs_to_bigtable_template.run(mock_spark_session, mock_parsed_args)
-
-        mock_spark_session.read.format.assert_called_with(
-            constants.FORMAT_CSV)
-        mock_spark_session.read.format().options.assert_called_with(**{
-            constants.CSV_HEADER: 'false',
-            constants.CSV_INFER_SCHEMA: 'true',
-        })
-        mock_spark_session.read.format().options().load.assert_called_once_with("gs://test")
-        mock_spark_session.dataframe.DataFrame.write.format. \
-            assert_called_once_with(constants.FORMAT_BIGTABLE)
-        mock_spark_session.dataframe.DataFrame.write.format().options. \
-            assert_called_with(catalog='{key:value}')
-
-    @mock.patch.object(pyspark.sql, 'SparkSession')
-    def test_run_csv2(self, mock_spark_session):
-        """Tests GCSToBigTableTemplate runs with csv format and some optional csv options"""
-
-        gcs_to_bigtable_template = GCSToBigTableTemplate()
-        mock_parsed_args = gcs_to_bigtable_template.parse_args(
-            ["--gcs.bigtable.input.format=csv",
-             "--gcs.bigtable.input.location=gs://test",
-             "--gcs.bigtable.input.inferschema=false",
-             "--gcs.bigtable.input.sep=|",
-             "--gcs.bigtable.input.comment=#",
-             "--gcs.bigtable.input.timestampntzformat=yyyy-MM-dd'T'HH:mm:ss",
-             "--spark.bigtable.project.id=GCP_PROJECT",
-             "--spark.bigtable.instance.id=BIGTABLE_INSTANCE_ID",
-             "--gcs.bigtable.catalog.json={key:value}"])
-        mock_spark_session.read.format().options().load.return_value = mock_spark_session.dataframe.DataFrame
-        gcs_to_bigtable_template.run(mock_spark_session, mock_parsed_args)
-
-        mock_spark_session.read.format.assert_called_with(
-            constants.FORMAT_CSV)
-        mock_spark_session.read.format().options.assert_called_with(**{
-            constants.CSV_HEADER: 'true',
-            constants.CSV_INFER_SCHEMA: 'false',
-            constants.CSV_SEP: "|",
-            constants.CSV_COMMENT: "#",
-            constants.CSV_TIMESTAMPNTZFORMAT: "yyyy-MM-dd'T'HH:mm:ss",
-        })
-        mock_spark_session.read.format().options().load.assert_called_once_with("gs://test")
-        mock_spark_session.dataframe.DataFrame.write.format. \
-            assert_called_once_with(constants.FORMAT_BIGTABLE)
-        mock_spark_session.dataframe.DataFrame.write.format().options. \
-            assert_called_with(catalog='{key:value}')
+        assert parsed_args["gcs.bigtable.catalog.json"] == 'gs://dataproc-templates/conf/employeecatalog.json'
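The three run() tests were removed rather than adapted; with the catalog now fetched from GCS inside run(), they would need the storage client mocked. A hedged sketch, not part of this commit, of how such a test could look, assuming the module path dataproc_templates.gcs.gcs_to_bigtable and the same mocked-SparkSession pattern the old tests used:

```python
# Hypothetical test, not in this commit: mock the GCS client so run() gets the
# catalog without network access. The patch target assumes the template module path.
from unittest import mock

import pyspark

from dataproc_templates.gcs.gcs_to_bigtable import GCSToBigTableTemplate


@mock.patch("dataproc_templates.gcs.gcs_to_bigtable.storage.Client")
@mock.patch.object(pyspark.sql, "SparkSession")
def test_run_with_mocked_catalog(mock_spark_session, mock_storage_client):
    """run() should download the catalog from the GCS path given on the CLI."""
    mock_blob = mock_storage_client.return_value.bucket.return_value.blob.return_value
    mock_blob.download_as_text.return_value = '{"table":{"name":"cities"}}'

    template = GCSToBigTableTemplate()
    args = template.parse_args(
        ["--gcs.bigtable.input.format=parquet",
         "--gcs.bigtable.input.location=gs://test",
         "--spark.bigtable.project.id=GCP_PROJECT",
         "--spark.bigtable.instance.id=BIGTABLE_INSTANCE_ID",
         "--gcs.bigtable.catalog.json=gs://dataproc-templates/conf/cities.json"])

    mock_spark_session.read.parquet.return_value = mock_spark_session.dataframe.DataFrame
    template.run(mock_spark_session, args)

    # Bucket and object are derived from the gs:// path, as in the template code above.
    mock_storage_client.return_value.bucket.assert_called_once_with("dataproc-templates")
    mock_blob.download_as_text.assert_called_once()
```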
