Skip to content
Snippets Groups Projects
Commit 33b40bc9 authored by Franziska Roepke's avatar Franziska Roepke
Browse files

file in wrong directory

parent c8140241
No related branches found
No related tags found
1 merge request!1Dev
This commit is part of merge request !1. Comments created here will be created in the context of that merge request.
%% Cell type:markdown id: tags:
## Erste Spielereien
### Geht das mit der API für uns ?
Probiere mal das Python-Package aus, das es gibt:
https://pypi.org/project/hca/
Okay, Fail: Das wird nicht mehr verwendet. Schade.
%% Cell type:code id: tags:
``` python
import requests
import os
from tqdm import tqdm
import json
import pandas as pd
from tabulate import tabulate
```
%% Cell type:code id: tags:
``` python
# Download the files selected by ``index`` from ``files_list``.
# Adapted from the HCA example code for downloading a single file.
def download_file(index, output_path, files_list):
    """Download the selected entries of ``files_list`` to disk.

    Args:
        index: Iterable of positions into ``files_list``. A single int is
            accepted as well and treated as a one-element selection.
        output_path: Target location. If it is an existing directory, each
            file is written inside it under the entry's ``'fileName'``.
            Otherwise it is used as the destination file path itself.
        files_list: Sequence of dicts providing ``'url'`` and, when
            ``output_path`` is a directory, ``'fileName'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if isinstance(index, int):
        index = [index]
    for i in index:
        entry = files_list[i]
        # Work around https://github.com/DataBiosphere/azul/issues/2908
        url = entry['url'].replace('/fetch', '')
        # Opening a directory for writing raises IsADirectoryError, so the
        # file name has to be appended when a directory was passed.
        if os.path.isdir(output_path):
            destination = os.path.join(output_path, entry['fileName'])
        else:
            destination = output_path
        # The context manager releases the streamed connection even when the
        # download is interrupted.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            total = int(response.headers.get('content-length', 0))
            print(f'Downloading to: {destination}', flush=True)
            with open(destination, 'wb') as f:
                with tqdm(total=total, unit='B', unit_scale=True,
                          unit_divisor=1024) as bar:
                    for chunk in response.iter_content(chunk_size=1024):
                        bar.update(f.write(chunk))
```
%% Cell type:code id: tags:
``` python
# Catalog name sent along with the requests below.
catalog = 'dcp26'
# Local directory the downloaded data is meant to end up in.
save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
# UUID of the project to query, and the project endpoint derived from it.
project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'
endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'
```
%% Cell type:code id: tags:
``` python
# Example: collect (and optionally download) the matrix files of one project.
def iterate_matrices_tree(tree, keys=()):
    """Yield ``(keys, file_info)`` for every file found in ``tree``.

    Dicts are descended into, with the dict key appended to ``keys``; every
    element of a list is yielded together with the keys that led to it.
    NOTE(review): this follows the helper of the HCA example notebook and
    assumes the matrices tree consists of nested dicts with lists of file
    dicts at the leaves - confirm against the API response.

    Raises:
        TypeError: If a node is neither a dict nor a list.
    """
    if isinstance(tree, dict):
        for key, subtree in tree.items():
            yield from iterate_matrices_tree(subtree, keys=(*keys, key))
    elif isinstance(tree, list):
        for file_info in tree:
            yield keys, file_info
    else:
        raise TypeError(f'Unexpected node in matrices tree: {tree!r}')


response = requests.get(endpoint_url, params={'catalog': catalog})
response.raise_for_status()
response_json = response.json()
project = response_json['projects'][0]

# Remember the URLs already handled so that no file is processed twice.
file_urls = set()
for key in ('matrices', 'contributedAnalyses'):
    tree = project[key]
    for path, file_info in iterate_matrices_tree(tree):
        url = file_info['url']
        if url not in file_urls:
            dest_path = os.path.join(save_location, file_info['name'])
            # TODO uncomment the following line if you really want to download data.
            # NOTE: download_file in this notebook takes
            # (index, output_path, files_list), so the call has to be adapted first.
            # download_file(url, dest_path)
            file_urls.add(url)
print('Downloads Complete.')
```
%% Output
Downloads Complete.
%% Cell type:code id: tags:
``` python
# Fetch file metadata (loom files by default) from the files index endpoint.
def get_files_metadata(catalog='dcp26', filters=None, size=10,
                       sort='lastModifiedDate', order='asc'):
    """Query the files index and return the parsed JSON response.

    All parameters are optional; called without arguments the request is the
    same as the previously hard-coded one.

    Args:
        catalog: Value of the ``catalog`` query parameter.
        filters: Value of the ``filters`` query parameter, either as a JSON
            string or as a dict that is serialised with ``json.dumps``.
            ``None`` selects files of format ``loom`` or ``loom.gz``.
        size: Value of the ``size`` query parameter.
        sort: Value of the ``sort`` query parameter.
        order: Value of the ``order`` query parameter.

    Returns:
        The decoded JSON body if the status code is 200. Otherwise the status
        code is printed and ``None`` is returned, so callers should check the
        result before indexing into it.
    """
    if filters is None:
        filters = '{"fileFormat": {"is": ["loom","loom.gz"]}}'
    elif not isinstance(filters, str):
        filters = json.dumps(filters)
    params = {
        'catalog': catalog,
        'filters': filters,
        'size': size,
        'sort': sort,
        'order': order,
    }
    url = 'https://service.azul.data.humancellatlas.org/index/files'
    response = requests.get(url, params=params)
    if response.status_code != 200:
        # Keep the best-effort behaviour: report the status, return nothing.
        print("Error:", response.status_code)
        return None
    return response.json()
```
%% Cell type:code id: tags:
``` python
def get_file_list(response_json):
    """Flatten the hits of a files response into a list of indexed dicts.

    Every file of every hit becomes one dict. The ``'index'`` value counts
    the files across all hits, starting at 0.
    """
    # Pair each file with the hit it belongs to; the hit's project data is
    # only looked up when the hit actually contains a file.
    hit_file_pairs = (
        (hit, entry)
        for hit in response_json['hits']
        for entry in hit['files']
    )
    return [
        {
            'index': position,
            'fileName': entry['name'],
            'size': entry['size'],
            'version': entry['version'],
            'projectShortname': hit['projects'][0]['projectShortname'][0],
            'projectId': hit['projects'][0]['projectId'][0],
            'entryId': hit['entryId'],
            'contentDescription': entry['contentDescription'][0],
            'url': entry['url'],
        }
        for position, (hit, entry) in enumerate(hit_file_pairs)
    ]
def print_file_table(response_json):
    """Print the given file entries as a ``fancy_grid`` table.

    Despite its name, ``response_json`` is used directly as the list of file
    dicts (one table row per element), not as a raw API response.
    """
    # Column title paired with the dict key that fills the column.
    columns = (
        ("Index", 'index'),
        ("File Name", 'fileName'),
        ("Size", 'size'),
        ("Version", 'version'),
        ("Project (shortname)", 'projectShortname'),
        ("ProjectID", 'projectId'),
        ("EntryId", 'entryId'),
        ("Content Description", 'contentDescription'),
    )
    headers = [title for title, _ in columns]
    rows = [[entry[key] for _, key in columns] for entry in response_json]
    print(tabulate(rows, headers, tablefmt='fancy_grid'))
```
%% Cell type:code id: tags:
``` python
# Example workflow: fetch file metadata, show it as a table, download a selection.
# Fetch the file metadata ("antwort" is German for "response").
antwort = get_files_metadata()
# Turn the response into a list of indexed file dicts and print it as a table.
files_list = get_file_list(antwort)
print_file_table(files_list)
# Specify which file(s) to download as a list of values from the table's "Index" column.
index = [6]
# NOTE(review): catalog is assigned here but not passed to any call in this cell.
catalog = 'dcp26'
save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
download_file(index, save_location,files_list)
```
%% Output
╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕
│ Index │ File Name │ Size │ Version │ Project (shortname) │ ProjectID │ EntryId │ Content Description │
╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡
│ 0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │ 854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │ 938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 4 │ t-cell-activation-human-lung-10XV2.loom │ 395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │ 423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │ 745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │ 698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix │
├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
│ 9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │ 425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix │
╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛
Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data
---------------------------------------------------------------------------
IsADirectoryError Traceback (most recent call last)
Cell In[75], line 12
9 catalog = 'dcp26'
10 save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
---> 12 download_file(index, save_location,files_list)
Cell In[68], line 16, in download_file(index, output_path, files_list)
13 total = int(response.headers.get('content-length', 0))
14 print(f'Downloading to: {output_path}', flush=True)
---> 16 with open(output_path, 'wb') as f:
17 with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
18 for chunk in response.iter_content(chunk_size=1024):
File ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs)
277 if file in {0, 1, 2}:
278 raise ValueError(
279 f"IPython won't let you open fd={file} by default "
280 "as it is likely to crash IPython. If you know what you are doing, "
281 "you can use builtins' open."
282 )
--> 284 return io_open(file, *args, **kwargs)
IsADirectoryError: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
%% Cell type:markdown id: tags:
## Weiterführende Notizen
https://www.askpython.com/python/examples/pull-data-from-an-api
%% Cell type:markdown id: tags:
Loom Dateien:
Umgang mit LoomPy:
https://linnarssonlab.org/loompy/apiwalkthrough/index.html
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment