Skip to content
Snippets Groups Projects
Commit 2abf64c8 authored by Franziska Roepke's avatar Franziska Roepke
Browse files

Replace API_access.ipynb

parent 2010b8ed
No related branches found
No related tags found
1 merge request!1Dev
This commit is part of merge request !1. Comments created here will be created in the context of that merge request.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Erste Spielereien # download loom files via api
### Geht das mit der API für uns ? ### see api documentation
probiere mal das Python-Package, das es gibt... [documentation](https://service.azul.data.humancellatlas.org/)
https://pypi.org/project/hca/
Okay, Fehlschlag: das wird nicht mehr verwendet. Schade.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import requests import requests
import os import os
from tqdm import tqdm from tqdm import tqdm
import json import json
import pandas as pd import pandas as pd
from tabulate import tabulate from tabulate import tabulate
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Download one or more files, selected by index, from a list of file entries
# (dicts carrying at least 'url' and 'fileName', as built by get_file_list).
def download_file(index, output_path, files_list):
    """Download the selected entries of ``files_list`` to disk.

    Args:
        index: Iterable of integer positions into ``files_list``.
        output_path: Target location. If it is an existing directory, each
            file is stored inside it under the entry's ``'fileName'``;
            otherwise the value is used verbatim as the destination file path.
        files_list: Sequence of dicts with the key ``'url'`` and, for
            directory targets, ``'fileName'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If an entry of ``index`` is out of range.
    """
    for i in index:
        entry = files_list[i]
        # Work around https://github.com/DataBiosphere/azul/issues/2908
        url = entry['url'].replace('/fetch', '')
        # open() cannot write to a directory (IsADirectoryError), so derive
        # the file path from the entry name when a directory was given.
        if os.path.isdir(output_path):
            dest_path = os.path.join(output_path, entry['fileName'])
        else:
            dest_path = output_path
        # The context manager releases the streamed connection even when
        # writing fails half-way through.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            total = int(response.headers.get('content-length', 0))
            print(f'Downloading to: {dest_path}', flush=True)
            with open(dest_path, 'wb') as f:
                with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
                    for chunk in response.iter_content(chunk_size=1024):
                        size = f.write(chunk)
                        bar.update(size)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Configuration for the single-project example.
# UUID of the project whose matrices are listed.
project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'
# Name of the Azul catalog that is passed as the 'catalog' query parameter.
catalog = 'dcp26'
# Project endpoint of the Azul service for the project above.
endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'
# Local directory for downloaded files (machine-specific absolute path).
save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Example: walk the matrix files of one project (the download itself is disabled).
# NOTE(review): iterate_matrices_tree() is not defined in the visible part of
# this notebook -- confirm it is defined before running this cell.
response = requests.get(endpoint_url, params={'catalog': catalog})
response.raise_for_status()
response_json = response.json()
project = response_json['projects'][0]

# URLs already handled, so a file listed under both keys is visited only once.
file_urls = set()
for key in ('matrices', 'contributedAnalyses'):
    tree = project[key]
    for path, file_info in iterate_matrices_tree(tree):
        url = file_info['url']
        if url in file_urls:
            continue
        dest_path = os.path.join(save_location, file_info['name'])
        # TODO: uncomment the following line if you really want to download data.
        # NOTE: the download_file() defined in this notebook takes
        # (index, output_path, files_list), so this call must be adapted first.
        # download_file(url, dest_path)
        file_urls.add(url)
print('Downloads Complete.')
``` ```
%% Output %% Output
Downloads Complete. Downloads Complete.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Fetch metadata of matrix files (loom by default) from the Azul files index.
def get_files_metadata(catalog='dcp26', file_formats=('loom', 'loom.gz'), size=10,
                       sort='lastModifiedDate', order='asc'):
    """Query the Azul ``/index/files`` endpoint and return the decoded JSON.

    Called without arguments, the request is identical to the previously
    hard-coded one.

    Args:
        catalog: Value of the ``catalog`` query parameter.
        file_formats: File formats to keep; sent as the ``fileFormat`` filter.
        size: Value of the ``size`` query parameter.
        sort: Value of the ``sort`` query parameter.
        order: Value of the ``order`` query parameter.

    Returns:
        The parsed JSON response on HTTP 200; otherwise ``None`` after
        printing the status code.
    """
    # Compact item separator keeps the default filter string byte-identical
    # to the former literal '{"fileFormat": {"is": ["loom","loom.gz"]}}'.
    filters = json.dumps({'fileFormat': {'is': list(file_formats)}},
                         separators=(',', ': '))
    params = {
        'catalog': catalog,
        'filters': filters,
        'size': size,
        'sort': sort,
        'order': order,
    }
    url = 'https://service.azul.data.humancellatlas.org/index/files'
    response = requests.get(url, params=params)
    if response.status_code == 200:
        # Request was successful
        return response.json()
    # An error occurred: report it and signal failure to the caller.
    print("Error:", response.status_code)
    return None
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def get_file_list(response_json):
    """Flatten a files-index response into a list of per-file dicts.

    Args:
        response_json: Decoded response with a ``'hits'`` list, each hit
            carrying ``'files'``, ``'projects'`` and ``'entryId'``; ``None``
            (a failed request) is accepted as well.

    Returns:
        A list of dicts with the keys ``index``, ``fileName``, ``size``,
        ``version``, ``projectShortname``, ``projectId``, ``entryId``,
        ``contentDescription`` and ``url``. ``index`` is the position of the
        entry in the returned list. An empty list is returned for ``None``.
    """
    if response_json is None:
        return []
    file_data = []
    for hit in response_json['hits']:
        for file in hit['files']:
            project = hit['projects'][0]
            descriptions = file['contentDescription']
            file_data.append({
                # Running position; equals the entry's index in file_data.
                'index': len(file_data),
                'fileName': file['name'],
                'size': file['size'],
                'version': file['version'],
                'projectShortname': project['projectShortname'][0],
                'projectId': project['projectId'][0],
                'entryId': hit['entryId'],
                # An empty description list must not abort the whole listing.
                'contentDescription': descriptions[0] if descriptions else None,
                'url': file['url'],
            })
    return file_data
def print_file_table(response_json):
    """Print the file entries as a table in tabulate's ``fancy_grid`` format.

    Args:
        response_json: Despite its name, the list of per-file dicts as
            returned by ``get_file_list`` (not the raw response). Each dict
            must provide the keys listed in ``columns`` below.
    """
    headers = ["Index", "File Name", "Size", "Version", "Project (shortname)", "ProjectID",
               "EntryId", "Content Description"]
    # Dict keys in the same order as the headers above.
    columns = ('index', 'fileName', 'size', 'version', 'projectShortname',
               'projectId', 'entryId', 'contentDescription')
    table_data = [[entry[key] for key in columns] for entry in response_json]
    table = tabulate(table_data, headers, tablefmt='fancy_grid')
    print(table)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# example workflow
# get file metadata
antwort = get_files_metadata()
# transform it into a list of file entries and print it as a table
files_list = get_file_list(antwort)
print_file_table(files_list)
# specify which file(s) you want to download (provide index-list)
index = [6]
catalog = 'dcp26'
save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
# Pass a complete file path per index: handing over the bare directory makes
# open() fail with IsADirectoryError.
for i in index:
    dest_path = os.path.join(save_location, files_list[i]['fileName'])
    download_file([i], dest_path, files_list)
``` ```
%% Output %% Output
╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕ ╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕
│ Index │ File Name │ Size │ Version │ Project (shortname) │ ProjectID │ EntryId │ Content Description │ │ Index │ File Name │ Size │ Version │ Project (shortname) │ ProjectID │ EntryId │ Content Description │
╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡ ╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡
│ 0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │ 854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix │ │ 0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │ 854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix │ │ 1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │ 938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix │ │ 2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │ 938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix │ │ 3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 4 │ t-cell-activation-human-lung-10XV2.loom │ 395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix │ │ 4 │ t-cell-activation-human-lung-10XV2.loom │ 395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │ 423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix │ │ 5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │ 423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix │ │ 6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │ 745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix │ │ 7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │ 745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │ 698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix │ │ 8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │ 698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤ ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │ 425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix │ │ 9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │ 425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix │
╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛ ╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛
Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data
--------------------------------------------------------------------------- ---------------------------------------------------------------------------
IsADirectoryError Traceback (most recent call last) IsADirectoryError Traceback (most recent call last)
Cell In[75], line 12 Cell In[75], line 12
9 catalog = 'dcp26' 9 catalog = 'dcp26'
10 save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data' 10 save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
---> 12 download_file(index, save_location,files_list) ---> 12 download_file(index, save_location,files_list)
Cell In[68], line 16, in download_file(index, output_path, files_list) Cell In[68], line 16, in download_file(index, output_path, files_list)
13 total = int(response.headers.get('content-length', 0)) 13 total = int(response.headers.get('content-length', 0))
14 print(f'Downloading to: {output_path}', flush=True) 14 print(f'Downloading to: {output_path}', flush=True)
---> 16 with open(output_path, 'wb') as f: ---> 16 with open(output_path, 'wb') as f:
17 with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar: 17 with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
18 for chunk in response.iter_content(chunk_size=1024): 18 for chunk in response.iter_content(chunk_size=1024):
File ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs) File ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs)
277 if file in {0, 1, 2}: 277 if file in {0, 1, 2}:
278 raise ValueError( 278 raise ValueError(
279 f"IPython won't let you open fd={file} by default " 279 f"IPython won't let you open fd={file} by default "
280 "as it is likely to crash IPython. If you know what you are doing, " 280 "as it is likely to crash IPython. If you know what you are doing, "
281 "you can use builtins' open." 281 "you can use builtins' open."
282 ) 282 )
--> 284 return io_open(file, *args, **kwargs) --> 284 return io_open(file, *args, **kwargs)
IsADirectoryError: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data' IsADirectoryError: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Weiterführende Notizen ## Weiterführende Notizen
https://www.askpython.com/python/examples/pull-data-from-an-api https://www.askpython.com/python/examples/pull-data-from-an-api
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Loom Dateien: Loom Dateien:
Umgang mit LoomPy: Umgang mit LoomPy:
https://linnarssonlab.org/loompy/apiwalkthrough/index.html https://linnarssonlab.org/loompy/apiwalkthrough/index.html
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment