
Commit 3f944df

Feat: Onboard IDC PDP datasets (#230)
1 parent 27a2c8e commit 3f944df

39 files changed: +3754 −0 lines changed
@@ -0,0 +1,21 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.8
ENV PYTHONUNBUFFERED True
COPY requirements.txt ./
RUN python3 -m pip install --no-cache-dir -r requirements.txt
WORKDIR /custom
COPY ./script.py .
CMD ["python3", "script.py"]
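This image installs the dependencies from requirements.txt, copies only script.py into /custom, and runs it unattended on container start; PYTHONUNBUFFERED keeps log output flushing immediately, which matters for a long-polling job like the one below.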
@@ -0,0 +1,3 @@
google-api-core
google-cloud-bigquery-datatransfer
protobuf
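These three dependencies map directly onto the script's imports below: google-api-core provides the ResourceExhausted exception, google-cloud-bigquery-datatransfer provides the transfer-service client, and protobuf supplies the Timestamp message used for the requested run time.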
@@ -0,0 +1,182 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import operator
import os
import time
import typing

from google.api_core.exceptions import ResourceExhausted
from google.cloud import bigquery_datatransfer_v1
from google.protobuf.timestamp_pb2 import Timestamp

RETRY_DELAY = 10


class TimeoutError(Exception):
    """Raised when the BQ transfer jobs haven't all finished within the allotted time"""

    pass


def main(
    source_project_id: str,
    target_project_id: str,
    service_account: str,
    dataset_name: str,
    dataset_versions: typing.List[str],
    timeout: int,
):
    client = bigquery_datatransfer_v1.DataTransferServiceClient()
    transfer_config_prefix = f"{dataset_name}-copy"
    transfer_configs = client.list_transfer_configs(
        request=bigquery_datatransfer_v1.types.ListTransferConfigsRequest(
            parent=f"projects/{target_project_id}"
        )
    )

    existing_configs = [
        config
        for config in transfer_configs
        if config.display_name.startswith(transfer_config_prefix)
    ]

    _running_configs = []
    for version in dataset_versions:
        dataset_id = f"{dataset_name}_{version}"
        display_name = f"{transfer_config_prefix}-{version}"

        # Reuse the transfer config for this version if one already exists
        _config = next(
            (
                config
                for config in existing_configs
                if config.display_name == display_name
            ),
            None,
        )
        if not _config:
            _config = create_transfer_config(
                client,
                source_project_id,
                target_project_id,
                dataset_id,
                display_name,
                service_account,
            )

        trigger_config(client, _config)
        _running_configs.append(_config)

    wait_for_completion(client, _running_configs, timeout)


def wait_for_completion(
    client: bigquery_datatransfer_v1.DataTransferServiceClient,
    running_configs: typing.List[bigquery_datatransfer_v1.types.TransferConfig],
    timeout: int,
) -> None:
    _start = int(time.time())

    while True:
        latest_runs = []
        for config in running_configs:
            latest_runs.append(latest_transfer_run(client, config))

        logging.info(f"States: {[str(run.state) for run in latest_runs]}")

        # Mark as complete when all runs have succeeded
        if all([str(run.state) == "TransferState.SUCCEEDED" for run in latest_runs]):
            return

        # Stop the process when it's longer than the allotted time
        if int(time.time()) - _start > timeout:
            raise TimeoutError

        time.sleep(RETRY_DELAY)


def latest_transfer_run(
    client: bigquery_datatransfer_v1.DataTransferServiceClient,
    config: bigquery_datatransfer_v1.types.TransferConfig,
) -> bigquery_datatransfer_v1.types.TransferRun:
    # The most recent run is the one with the latest run_time
    transfer_runs = client.list_transfer_runs(parent=config.name)
    return max(transfer_runs, key=operator.attrgetter("run_time"))


def create_transfer_config(
    client: bigquery_datatransfer_v1.DataTransferServiceClient,
    source_project_id: str,
    target_project_id: str,
    dataset_id: str,
    display_name: str,
    service_account: str,
) -> bigquery_datatransfer_v1.types.TransferConfig:
    transfer_config = bigquery_datatransfer_v1.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name=display_name,
        data_source_id="cross_region_copy",
        dataset_region="US",
        params={
            "source_project_id": source_project_id,
            "source_dataset_id": dataset_id,
        },
        schedule_options=bigquery_datatransfer_v1.ScheduleOptions(
            disable_auto_scheduling=True
        ),
    )

    request = bigquery_datatransfer_v1.types.CreateTransferConfigRequest(
        parent=client.common_project_path(target_project_id),
        transfer_config=transfer_config,
        service_account_name=service_account,
    )

    return client.create_transfer_config(request=request)


def trigger_config(
    client: bigquery_datatransfer_v1.DataTransferServiceClient,
    config: bigquery_datatransfer_v1.types.TransferConfig,
) -> None:
    # Split the current time into whole seconds and nanoseconds for the proto Timestamp
    now = time.time()
    seconds = int(now)
    nanos = int((now - seconds) * 10 ** 9)

    try:
        client.start_manual_transfer_runs(
            request=bigquery_datatransfer_v1.types.StartManualTransferRunsRequest(
                parent=config.name,
                requested_run_time=Timestamp(seconds=seconds, nanos=nanos),
            )
        )
    except ResourceExhausted:
        # The service rejects a manual run while one is already in progress
        logging.info(
            f"Transfer job is currently running for config ({config.display_name}) {config.name}."
        )
        return


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    main(
        source_project_id=os.environ["SOURCE_PROJECT_ID"],
        target_project_id=os.environ["TARGET_PROJECT_ID"],
        service_account=os.environ["SERVICE_ACCOUNT"],
        dataset_name=os.environ["DATASET_NAME"],
        dataset_versions=json.loads(os.environ["DATASET_VERSIONS"]),
        timeout=int(os.getenv("TIMEOUT", 1200)),
    )
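For local testing, here is a minimal sketch of driving the same entry point without the container, mirroring the environment-variable wiring in the __main__ block. Every identifier below (project IDs, service account, dataset name and versions) is a hypothetical placeholder, and the call assumes application-default GCP credentials with BigQuery Data Transfer permissions:

import script  # the module above, saved as script.py

script.main(
    source_project_id="my-source-project",  # hypothetical
    target_project_id="my-target-project",  # hypothetical
    service_account="bq-copier@my-target-project.iam.gserviceaccount.com",  # hypothetical
    dataset_name="idc",  # hypothetical; yields configs named "idc-copy-<version>"
    dataset_versions=["v1", "v2"],  # hypothetical; copies datasets idc_v1 and idc_v2
    timeout=1200,  # seconds; matches the script's TIMEOUT fallback
)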
@@ -0,0 +1,21 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.8
ENV PYTHONUNBUFFERED True
COPY requirements.txt ./
RUN python3 -m pip install --no-cache-dir -r requirements.txt
WORKDIR /custom
COPY . .
CMD ["python3", "script.py"]
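The only difference from the first image is COPY . ., which copies the entire build context into /custom rather than just script.py, presumably so that any files this variant's script reads at runtime are shipped alongside it.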
Binary file not shown.
@@ -0,0 +1,39 @@
WITH
  pre_dicom_all AS (
    SELECT
      aux.idc_webapp_collection_id AS collection_id,
      aux.gcs_url AS gcs_url,
      aux.gcs_bucket AS gcs_bucket,
      aux.study_uuid AS crdc_study_uuid,
      aux.series_uuid AS crdc_series_uuid,
      aux.instance_uuid AS crdc_instance_uuid,
      aux.idc_case_id AS idc_case_id,
      aux.instance_size AS instance_size,
      aux.version_hash AS version_hash,
      aux.collection_hash AS collection_hash,
      aux.patient_hash AS patient_hash,
      aux.study_hash AS study_hash,
      aux.series_hash AS series_hash,
      aux.instance_hash AS instance_hash,
      aux.source_doi AS Source_DOI,
      dcm.*
    FROM
      `PROJECT.DATASET.auxiliary_metadata` AS aux
    INNER JOIN
      `PROJECT.DATASET.dicom_metadata` AS dcm
    ON
      aux.SOPInstanceUID = dcm.SOPInstanceUID
  )

SELECT
  data_collections.Location AS tcia_tumorLocation,
  data_collections.Species AS tcia_species,
  data_collections.CancerType AS tcia_cancerType,
  pre_dicom_all.*
FROM
  pre_dicom_all
INNER JOIN
  `PROJECT.DATASET.original_collections_metadata` AS data_collections
ON
  pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id
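The PROJECT.DATASET qualifiers above read as templated placeholders rather than real identifiers. A minimal sketch of executing the query once they are substituted, assuming google-cloud-bigquery is installed (it is not in this commit's requirements.txt) and that the SQL is saved locally under a hypothetical filename:

from pathlib import Path

from google.cloud import bigquery

# Hypothetical: the query above saved locally, with PROJECT/DATASET
# swapped for real identifiers before running.
sql = (
    Path("dicom_all.sql")
    .read_text()
    .replace("PROJECT", "my-project")      # hypothetical project ID
    .replace("DATASET", "my_idc_dataset")  # hypothetical dataset ID
)

client = bigquery.Client()
rows = client.query(sql).result()  # blocks until the join completes
for row in rows:
    # Print one row as a smoke test of the joined output
    print(row["collection_id"], row["tcia_tumorLocation"])
    break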
