Paul Kuehnel
--- a/API_access_local_backup.ipynb 0 → 100644

+ 240

− 0
+++ b/API_access_local_backup.ipynb 0 → 100644

+ 240

− 0
+%% Cell type:markdown id: tags:
+
+## Erste Spielereien
+
+### Geht das mit der API für uns?
+Probiere mal das Python-Package, das es dafür gibt:
+https://pypi.org/project/hca/
+
+Okay, Fehlschlag: Das Package wird nicht mehr verwendet. Schade.
+
+%% Cell type:code id: tags:
+
+``` python
+import requests
+import os
+from tqdm import tqdm
+import json
+import pandas as pd
+from tabulate import tabulate
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# example function, downloading a single file which was specified before
+# modified!
+# TODO make this function work. Add file name to output_pat (see example code)
+
+def download_file(index, output_path, files_list):
+    for i in index:
+
+        url_tp = files_list[i]['url']
+        url = url_tp.replace('/fetch', '')  # Work around https://github.com/DataBiosphere/azul/issues/2908
+
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        total = int(response.headers.get('content-length', 0))
+        print(f'Downloading to: {output_path}', flush=True)
+
+        with open(output_path, 'wb') as f:
+            with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
+                for chunk in response.iter_content(chunk_size=1024):
+                    size = f.write(chunk)
+                    bar.update(size)
+
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Project UUID and catalog name used by the example project query.
+project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'
+catalog = 'dcp26'
+# Azul endpoint URL for the project identified by project_uuid
+endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'
+
+# Local target directory for downloads (machine-specific absolute path)
+save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Example: walk the files of a project and (optionally) download them.
+
+response = requests.get(endpoint_url, params={'catalog': catalog,})
+response.raise_for_status()
+response_json = response.json()
+project = response_json['projects'][0]
+
+# Remember the URLs already handled so each file is processed only once.
+file_urls = set()
+for key in ('matrices', 'contributedAnalyses'):
+    tree = project[key]
+    # NOTE(review): iterate_matrices_tree is not defined in any cell shown in
+    # this notebook; it must already exist in the kernel when this cell runs.
+    for path, file_info in iterate_matrices_tree(tree):
+        url = file_info['url']
+        if url not in file_urls:
+            dest_path = os.path.join(save_location, file_info['name'])
+            # TODO: uncomment the following line to actually download data.
+            # NOTE(review): download_file is defined as
+            # download_file(index, output_path, files_list), so this call
+            # has to be adapted first.
+            # download_file(url, dest_path)
+            file_urls.add(url)
+print('Downloads Complete.')
+
+```
+
+%% Output
+
+    Downloads Complete.
+
+%% Cell type:code id: tags:
+
+``` python
+# function gets meta data of loom files
+# TODO make default params and abiltiy to passing params to the function
+def get_files_metadata():
+    params = {
+        'catalog': 'dcp26',
+        'filters': '{"fileFormat": {"is": ["loom","loom.gz"]}}',
+        'size': 10,
+        'sort': 'lastModifiedDate',
+        'order': 'asc'
+    }
+    url = f'https://service.azul.data.humancellatlas.org/index/files'
+    response = requests.get(url, params=params)
+
+    # Check the response status code
+    if response.status_code == 200:
+        # Request was successful
+        response_json = response.json()
+        return response_json
+    else:
+        # An error occurred
+        print("Error:", response.status_code)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+
+def get_file_list(response_json):
+    # extract file-information into a list of dicts
+    file_data = []  #list
+    for hit in response_json['hits']:
+        for file in hit['files']:
+            file_dict = {
+                'fileName':file['name'],
+                'size':file['size'],
+                'version':file['version'],
+                'projectShortname':hit['projects'][0]['projectShortname'][0],
+                'projectId':hit['projects'][0]['projectId'][0],
+                'entryId':hit['entryId'],
+                'contentDescription':file['contentDescription'][0],
+                'url':file['url']
+            }
+            file_data.append(file_dict)
+    file_data_with_index = [{
+        'index': i,
+        **file_dict
+    } for i, file_dict in enumerate(file_data)]
+    return file_data_with_index
+
+def print_file_table(response_json):
+
+    #file_data = get_file_list(response_json)
+    file_data = response_json
+
+    # create nested list out of file_data
+    headers = ["Index", "File Name" , "Size", "Version", "Project (shortname)", "ProjectID",
+               "EntryId", "Content Description"]
+    table_data = [[
+        file['index'],
+        file['fileName'],
+        file['size'],
+        file['version'],
+        file['projectShortname'],
+        file['projectId'],
+        file['entryId'],
+        file['contentDescription'],
+    ] for file in file_data]
+
+    table = tabulate(table_data, headers, tablefmt='fancy_grid')
+    print(table)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Example workflow
+# 1. Fetch the file metadata.
+antwort = get_files_metadata()
+# 2. Flatten it into a list of file dicts and print that list as a table.
+files_list = get_file_list(antwort)
+print_file_table(files_list)
+# 3. Specify which file(s) to download (list of table indices).
+index = [6]
+catalog = 'dcp26'
+save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
+
+# NOTE(review): save_location is passed as output_path; the recorded output
+# of this cell shows an IsADirectoryError raised for this path.
+download_file(index, save_location,files_list)
+```
+
+%% Output
+
+    ╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕
+    │   Index │ File Name                                 │       Size │ Version                     │ Project (shortname)        │ ProjectID                            │ EntryId                              │ Content Description   │
+    ╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡
+    │       0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │  854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │  938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       4 │ t-cell-activation-human-lung-10XV2.loom   │  395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │  423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │  745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │  698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix          │
+    ├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤
+    │       9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │  425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix          │
+    ╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛
+    Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data
+
+    ---------------------------------------------------------------------------
+    IsADirectoryError                         Traceback (most recent call last)
+Cell     In[75], line 12
+          9 catalog = 'dcp26'
+         10 save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
+    ---> 12 download_file(index, save_location,files_list)
+Cell     In[68], line 16, in download_file(index, output_path, files_list)
+         13 total = int(response.headers.get('content-length', 0))
+         14 print(f'Downloading to: {output_path}', flush=True)
+    ---> 16 with open(output_path, 'wb') as f:
+         17     with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
+         18         for chunk in response.iter_content(chunk_size=1024):
+File     ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs)
+        277 if file in {0, 1, 2}:
+        278     raise ValueError(
+        279         f"IPython won't let you open fd={file} by default "
+        280         "as it is likely to crash IPython. If you know what you are doing, "
+        281         "you can use builtins' open."
+        282     )
+    --> 284 return io_open(file, *args, **kwargs)
+    IsADirectoryError: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'
+
+%% Cell type:markdown id: tags:
+
+## Weiterführende Notizen
+
+https://www.askpython.com/python/examples/pull-data-from-an-api
+
+%% Cell type:markdown id: tags:
+
+Loom Dateien:
+
+Umgang mit LoomPy:
+
+https://linnarssonlab.org/loompy/apiwalkthrough/index.html