1
Google Cloud Storage上のJSONファイルをBigQueryデータセットに読み込む関数をPythonで記述しようとしていますが、まだ「スキーマが指定されていません」というRuntimeErrorが発生します。
import oauth2client
import uuid
import time
from google.cloud import bigquery as bq
# from oauth2client.client import GoogleCredentials
# Configuration
# NOTE(review): the `---` values below look like redacted placeholders; as
# written they are not valid Python and must be replaced with quoted strings
# before this script can run.
BILLING_PROJECT_ID = ---
DATASET_NAME = ---
TABLE_NAME = ---
BUCKET_NAME = ---
FILE = ---
# Source URI of the form gs://<BUCKET_NAME>/<FILE>.
SOURCE = 'gs://{}/{}'.format(BUCKET_NAME, FILE)
# Column definitions handed to load_data_from_gcs; all three are INTEGER.
SCHEMA = [
bq.SchemaField('question_id', 'INTEGER'),
bq.SchemaField('accepted_answer', 'INTEGER'),
bq.SchemaField('answer_count', 'INTEGER')
]
# Explicit credentials are currently disabled; the client below is built from
# the project id only.
# CREDENTIALS = GoogleCredentials.get_application_default()
# Module-level BigQuery client shared by the functions in this file.
client = bq.Client(project=BILLING_PROJECT_ID)
# Dataset
# Check if the dataset exists
def create_datasets(name):
    """Create the BigQuery dataset ``name`` unless it already exists.

    Uses the module-level ``client``. Prints a confirmation when the dataset
    was created and is visible afterwards; does nothing if it already exists.

    Args:
        name: Dataset name passed to ``client.dataset``.
    """
    dataset = client.dataset(name)
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would make this try to create an existing dataset.
    if dataset.exists():
        return
    dataset.create()
    # Same best-effort behaviour as before: report success only when the
    # dataset is visible after creation, and stay silent otherwise.
    if dataset.exists():
        print("Dataset {} created".format(name))
def load_data_from_gcs(dataset_name, table_name, source, schema):
    """Load newline-delimited JSON from Google Cloud Storage into a table.

    Starts a load job through the module-level ``client``, blocks in
    ``wait_for_job`` until the job is done, then prints the loaded row count.

    Args:
        dataset_name: Name of the destination dataset.
        table_name: Name of the destination table.
        source: ``gs://`` URI of the file to load.
        schema: List of ``bq.SchemaField`` objects describing the columns.

    Raises:
        RuntimeError: Propagated from ``wait_for_job`` when the job finishes
            with an error.
    """
    dataset = client.dataset(dataset_name)
    table = dataset.table(table_name)
    table.schema = schema
    job_name = str(uuid.uuid4())  # random name for this load job
    job = client.load_table_from_storage(job_name, table, source)
    job.source_format = 'NEWLINE_DELIMITED_JSON'
    # Setting the schema only on the local table object does not put it into
    # the load request (the job carries its own ``schema`` attribute), which
    # surfaces as a "no schema specified" error for a table that does not
    # exist yet. Attach it to the job explicitly.
    job.schema = schema
    job.begin()
    wait_for_job(job)
    print('Loaded {} rows into {}:{}.'.format(
        job.output_rows, dataset_name, table_name))
def wait_for_job(job, poll_interval=1):
    """Block until ``job`` reports the ``'DONE'`` state.

    Args:
        job: Object exposing ``reload()``, ``state``, ``error_result`` and
            ``errors``.
        poll_interval: Seconds to sleep between polls. Defaults to 1, the
            value that used to be hard-coded.

    Raises:
        RuntimeError: Carrying ``job.errors`` when the finished job has a
            truthy ``error_result``.
    """
    while True:
        # Re-read the job before every state check.
        job.reload()
        if job.state == 'DONE':
            if job.error_result:
                raise RuntimeError(job.errors)
            return
        time.sleep(poll_interval)
# Script entry: run the load using the module-level configuration values.
load_data_from_gcs(
    dataset_name=DATASET_NAME,
    table_name=TABLE_NAME,
    source=SOURCE,
    schema=SCHEMA,
)