Skip to content
Snippets Groups Projects

Dev

Merged Paul Kuehnel requested to merge dev into main
1 file
+ 240
0
Compare changes
  • Side-by-side
  • Inline
+ 240
0
%% Cell type:markdown id: tags:
## Erste Spielereien
### Geht das mit der API für uns ?
Probiere mal das Python-Package, das es dafür gibt:
https://pypi.org/project/hca/
Okay, Fehlschlag: Das wird nicht mehr verwendet. Schade.
%% Cell type:code id: tags:
``` python
import requests
import os
from tqdm import tqdm
import json
import pandas as pd
from tabulate import tabulate
```
%% Cell type:code id: tags:
``` python
# Download the selected entries of a file listing into a local directory.
def download_file(index, output_path, files_list):
    """Download the files selected by ``index`` into the directory ``output_path``.

    Args:
        index: Iterable of integer positions into ``files_list``.
        output_path: Directory the files are written to. It is created if it
            does not exist yet.
        files_list: Sequence of dicts; each selected entry must provide the
            keys ``'url'`` and ``'fileName'`` (the shape ``get_file_list``
            produces).

    Raises:
        IndexError: If a position in ``index`` is out of range.
        KeyError: If a selected entry lacks ``'url'`` or ``'fileName'``.
        requests.HTTPError: If the server answers with an error status.
    """
    os.makedirs(output_path, exist_ok=True)
    for i in index:
        file_entry = files_list[i]
        # Work around https://github.com/DataBiosphere/azul/issues/2908
        url = file_entry['url'].replace('/fetch', '')
        # Write to a file inside the target directory. Opening the directory
        # itself raises IsADirectoryError. basename() keeps the download
        # inside output_path even if the reported name has path components.
        dest_path = os.path.join(output_path, os.path.basename(file_entry['fileName']))
        # Using the response as a context manager releases the connection
        # even when the download is interrupted.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            total = int(response.headers.get('content-length', 0))
            print(f'Downloading to: {dest_path}', flush=True)
            with open(dest_path, 'wb') as f, \
                    tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
                for chunk in response.iter_content(chunk_size=1024):
                    bar.update(f.write(chunk))
```
%% Cell type:code id: tags:
``` python
# Settings for the single-project example below.
project_uuid = "4a95101c-9ffc-4f30-a809-f04518a23803"
catalog = "dcp26"
# Project endpoint of the Azul service for the project selected above.
endpoint_url = f"https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}"
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
save_location = "/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data"
```
%% Cell type:code id: tags:
``` python
# Example: walk all matrix files of one project (the actual download is disabled).
response = requests.get(endpoint_url, params={'catalog': catalog})
response.raise_for_status()
response_json = response.json()
project = response_json['projects'][0]

# URLs that were already visited, so every file is handled only once.
file_urls = set()
for key in ('matrices', 'contributedAnalyses'):
    tree = project[key]
    # NOTE(review): iterate_matrices_tree is not defined in the visible part of
    # this notebook; it has to exist in the session before this cell runs.
    for path, file_info in iterate_matrices_tree(tree):
        url = file_info['url']
        if url in file_urls:
            continue
        dest_path = os.path.join(save_location, file_info['name'])
        # TODO: uncomment the following line if you really want to download data.
        # NOTE(review): download_file in this notebook takes
        # (index, output_path, files_list), so this call needs adapting first.
        # download_file(url, dest_path)
        file_urls.add(url)
print('Downloads Complete.')
```
%% Output
Downloads Complete.
%% Cell type:code id: tags:
``` python
# Fetch file metadata from the /index/files endpoint of the Azul service.
def get_files_metadata(catalog='dcp26', file_formats=('loom', 'loom.gz'), size=10,
                       sort='lastModifiedDate', order='asc'):
    """Request file metadata and return the decoded JSON response.

    Called without arguments, the function sends the same query as before
    (loom files of catalog 'dcp26', 10 results, ascending by
    'lastModifiedDate').

    Args:
        catalog: Value sent as the ``catalog`` query parameter.
        file_formats: Iterable of file-format names placed in the
            ``fileFormat`` "is" filter.
        size: Value sent as the ``size`` query parameter.
        sort: Value sent as the ``sort`` query parameter.
        order: Value sent as the ``order`` query parameter.

    Returns:
        The parsed JSON body if the status code is 200. Otherwise the status
        code is printed and ``None`` is returned.
    """
    params = {
        'catalog': catalog,
        # The filter is transmitted as a JSON-encoded string.
        'filters': json.dumps({'fileFormat': {'is': list(file_formats)}}),
        'size': size,
        'sort': sort,
        'order': order,
    }
    url = 'https://service.azul.data.humancellatlas.org/index/files'
    response = requests.get(url, params=params)
    if response.status_code != 200:
        # Best-effort behaviour kept from the original: report and return None.
        print("Error:", response.status_code)
        return None
    return response.json()
```
%% Cell type:code id: tags:
``` python
def get_file_list(response_json):
    """Flatten the hits of a files response into one list of per-file dicts.

    Every file of every entry in ``response_json['hits']`` becomes one dict.
    Its ``'index'`` key is the running position of the file in the returned
    list, starting at 0.
    """
    records = []
    for hit in response_json['hits']:
        for entry in hit['files']:
            records.append({
                # The list length before appending is the running position.
                'index': len(records),
                'fileName': entry['name'],
                'size': entry['size'],
                'version': entry['version'],
                'projectShortname': hit['projects'][0]['projectShortname'][0],
                'projectId': hit['projects'][0]['projectId'][0],
                'entryId': hit['entryId'],
                'contentDescription': entry['contentDescription'][0],
                'url': entry['url'],
            })
    return records
def print_file_table(response_json):
    """Print the given file records as a 'fancy_grid' table.

    Despite its name, ``response_json`` is iterated as a sequence of per-file
    dicts (the shape ``get_file_list`` returns), not as a raw service
    response. Each dict must contain the keys listed in ``columns``.
    """
    # (column title, record key) pairs in display order.
    columns = (
        ("Index", 'index'),
        ("File Name", 'fileName'),
        ("Size", 'size'),
        ("Version", 'version'),
        ("Project (shortname)", 'projectShortname'),
        ("ProjectID", 'projectId'),
        ("EntryId", 'entryId'),
        ("Content Description", 'contentDescription'),
    )
    headers = [title for title, _ in columns]
    table_data = [[record[key] for _, key in columns] for record in response_json]
    print(tabulate(table_data, headers, tablefmt='fancy_grid'))
```
%% Cell type:code id: tags:
``` python
# Example workflow: list the available loom files, then download a selection.

# Query the file metadata from the service.
antwort = get_files_metadata()

# Flatten the response into per-file records and show them as a table.
files_list = get_file_list(antwort)
print_file_table(files_list)

# Select the file(s) to download by their position in files_list
# (the "Index" column of the printed table).
index = [6]
catalog = 'dcp26'
save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
download_file(index=index, output_path=save_location, files_list=files_list)
```
%% Output
╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕
│ Index │ File Name │ Size │ Version │ Project (shortname) │ ProjectID │ EntryId │ Content Description │
╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡
│ 0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │ 854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │ 938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 4 │ t-cell-activation-human-lung-10XV2.loom │ 395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │ 423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │ 745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │ 698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │ 425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix │
╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛
Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data
---------------------------------------------------------------------------
IsADirectoryError Traceback (most recent call last)
Cell In[75], line 12
9 catalog = 'dcp26'
10 save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
---> 12 download_file(index, save_location,files_list)
Cell In[68], line 16, in download_file(index, output_path, files_list)
13 total = int(response.headers.get('content-length', 0))
14 print(f'Downloading to: {output_path}', flush=True)
---> 16 with open(output_path, 'wb') as f:
17 with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
18 for chunk in response.iter_content(chunk_size=1024):
File ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs)
277 if file in {0, 1, 2}:
278 raise ValueError(
279 f"IPython won't let you open fd={file} by default "
280 "as it is likely to crash IPython. If you know what you are doing, "
281 "you can use builtins' open."
282 )
--> 284 return io_open(file, *args, **kwargs)
IsADirectoryError: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
%% Cell type:markdown id: tags:
## Weiterführende Notizen
https://www.askpython.com/python/examples/pull-data-from-an-api
%% Cell type:markdown id: tags:
Loom Dateien:
Umgang mit LoomPy:
https://linnarssonlab.org/loompy/apiwalkthrough/index.html
Loading