【发布时间】:2020-12-02 13:41:29
【问题描述】:
通过 Apache Beam Dataflow 作业以 “FILE_LOADS” 方式写入 BigQuery 时遇到错误。Streaming INSERT(else 分支)按预期正常工作;而 'FILE_LOAD'(if 分支)会失败,具体错误信息附在代码之后。GCS 存储桶上的临时文件是有效的 JSON 对象。
来自 Pub/Sub 的原始事件示例:
"{'event': 'test', 'entityId': 13615316690, 'eventTime': '2020-08-12T15:56:07.130899+00:00', 'targetEntityId': 8947793, 'targetEntityType': 'item', 'entityType': 'guest', 'properties': {}}"
"{'event': 'test', 'entityId': 13615316690, 'eventTime': '2020-08-12T15:56:07.130899+00:00', 'targetEntityId': 8947793, 'targetEntityType': 'item', 'entityType': 'guest', 'properties': {‘action’: ‘delete’}}"
from __future__ import absolute_import
import logging
import sys
import traceback
import argparse
import ast
import json
import datetime
import dateutil.parser as date_parser
import apache_beam as beam
import apache_beam.pvalue as pvalue
from google.cloud.bigquery import CreateDisposition, WriteDisposition
from apache_beam.io.gcp.bigquery_tools import RetryStrategy
def get_values(element):
    """Convert one raw event into a row dict for the BigQuery raw table.

    Args:
        element: The event, either as an already-parsed dict or as the raw
            message payload (``bytes`` or ``str``). Raw payloads are decoded
            as UTF-8 and parsed as JSON; payloads that are Python dict
            literals (single-quoted keys) rather than strict JSON are parsed
            with ``ast.literal_eval``.

    Returns:
        A dict with the keys ``event``, ``entity_type``, ``entity_id``,
        ``target_entity_type``, ``target_entity_id``, ``event_time`` and
        ``properties``. ``properties`` is a list of
        ``{'property_name': ..., 'property_value': ...}`` dicts, empty when
        the event carries no properties.

    Raises:
        ValueError, SyntaxError: If a raw payload is neither valid JSON nor
            a valid Python literal.
        TypeError: If ``eventTime`` is missing (``date_parser.parse(None)``).
    """
    if isinstance(element, (bytes, bytearray)):
        element = element.decode('utf-8')

    # Two passes at most: the payload may be a quoted string that itself
    # wraps the dict literal, e.g. "{'event': 'test', ...}".
    for _ in range(2):
        if not isinstance(element, str):
            break
        try:
            element = json.loads(element)
        except ValueError:
            # literal_eval only evaluates literals, so it is safe on
            # message content (unlike eval).
            element = ast.literal_eval(element)

    # Convert properties from a dict to a list of dicts so it fits a
    # repeated record column; a missing or null value becomes an empty list.
    properties = element.get('properties') or {}
    prop_list = [
        {'property_name': name, 'property_value': value}
        for name, value in properties.items()
    ]

    date_parsed = date_parser.parse(element.get('eventTime'))
    # Seconds are fixed to "00" by the format string, so the timestamp has
    # minute granularity.
    # NOTE(review): the UTC offset of eventTime is dropped, not converted --
    # confirm the target column expects that.
    event_time = date_parsed.strftime('%Y-%m-%d %H:%M:00')

    return {
        'event': element.get('event'),
        'entity_type': element.get('entityType'),
        'entity_id': element.get('entityId'),
        'target_entity_type': element.get('targetEntityType'),
        'target_entity_id': element.get('targetEntityId'),
        'event_time': event_time,
        'properties': prop_list,
    }
def stream_to_bq(c: dict):
    """Launch a streaming Dataflow job that copies Pub/Sub events to BigQuery.

    The pipeline reads messages from ``c['SUBSCRIPTION']``, maps each one
    through ``get_values`` and appends the resulting rows to the existing
    table ``c['RAW_TABLE']`` in ``c['PROJECT']`` / ``c['DATASET_NAME']``.

    Args:
        c: Job configuration. Keys read here: PROJECT, JOBNAME, BUCKET_NAME,
            STAGING_LOCATION, TEMP_LOCATION, NETWORKPATH, SUBNETWORKPATH,
            REGION, SERVICE_ACCOUNT, SUBSCRIPTION, RAW_TABLE, DATASET_NAME,
            FILE_LOAD, and (only when FILE_LOAD is truthy)
            FILE_LOAD_FREQUENCY.
    """
    bucket = c["BUCKET_NAME"]
    pipeline_args = [
        f'--project={c["PROJECT"]}',
        '--runner=DataflowRunner',
        f'--job_name={c["JOBNAME"]}',
        '--save_main_session',
        f'--staging_location=gs://{bucket}/{c["STAGING_LOCATION"]}',
        f'--temp_location=gs://{bucket}/{c["TEMP_LOCATION"]}',
        f'--network={c["NETWORKPATH"]}',
        f'--subnetwork={c["SUBNETWORKPATH"]}',
        f'--region={c["REGION"]}',
        f'--service_account_email={c["SERVICE_ACCOUNT"]}',
        # Optional flags, currently disabled:
        # f'--setup_file=./setup.py',
        # f'--autoscaling_algorithm=THROUGHPUT_BASED',
        # f'--maxWorkers=15',
        # f'--experiments=shuffle_mode=service',
        '--no_use_public_ips',
        '--streaming',
    ]

    use_file_loads = c['FILE_LOAD']
    if use_file_loads:
        pipeline_args.extend([
            '--experiments=allow_non_updatable_job',
            '--experiments=use_beam_bq_sink',
        ])

    pipeline = beam.Pipeline(argv=pipeline_args)

    source = beam.io.ReadFromPubSub(subscription=c['SUBSCRIPTION'])
    messages = pipeline | 'Read from Pubsub' >> source.with_output_types(bytes)
    rows = messages | 'Event Parser(BQ Row) ' >> beam.Map(get_values)

    # Load data to BigQuery using 'Load Jobs' or 'Streaming Insert'; the
    # choice is based on latency expectation.
    table = c["RAW_TABLE"]
    sink_options = {
        'project': c["PROJECT"],
        'dataset': c["DATASET_NAME"],
    }
    if use_file_loads:
        sink_options['method'] = 'FILE_LOADS'
        sink_options['triggering_frequency'] = c['FILE_LOAD_FREQUENCY']
    sink_options['create_disposition'] = CreateDisposition.CREATE_NEVER
    sink_options['write_disposition'] = WriteDisposition.WRITE_APPEND
    if not use_file_loads:
        sink_options['insert_retry_strategy'] = RetryStrategy.RETRY_ON_TRANSIENT_ERROR

    _ = rows | 'Write Result to BQ' >> beam.io.WriteToBigQuery(table, **sink_options)

    pipeline.run()
Dataflow 作业报出的错误:
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details.' reason: 'invalid'> [while running 'generatedPtransform-1801'] java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895) org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57)
【问题讨论】:
-
进展如何?你找到问题了吗?
标签: python google-bigquery google-cloud-dataflow apache-beam