Skip to content
Snippets Groups Projects

Dev

Merged Paul Kuehnel requested to merge dev into main
1 file
+ 0
0
Compare changes
  • Side-by-side
  • Inline
+ 296
0
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download loom files via the HCA Azul API\n",
"\n",
"### API documentation\n",
"See the [Azul service API documentation](https://service.azul.data.humancellatlas.org/)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import os\n",
"from tqdm import tqdm\n",
"import json\n",
"import pandas as pd\n",
"from tabulate import tabulate"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Download one or more files, selected by index, from a list of file entries.\n",
"\n",
"def download_file(index, output_path, files_list, chunk_size=1024 * 1024, timeout=60):\n",
"    \"\"\"Stream the selected files to disk with one progress bar per file.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    index : int or iterable of int\n",
"        Position(s) in ``files_list`` of the entries to download.\n",
"    output_path : str\n",
"        Either an existing directory, in which case each file is stored under\n",
"        its own ``fileName`` inside it, or the full path of the target file\n",
"        (single download only). If several indices are given the path is\n",
"        always treated as a directory and created when missing.\n",
"    files_list : sequence of dict\n",
"        Entries with a ``'url'`` key; ``'fileName'`` is also required when\n",
"        writing into a directory.\n",
"    chunk_size : int, optional\n",
"        Number of bytes requested per streamed chunk.\n",
"    timeout : float or tuple, optional\n",
"        Timeout in seconds handed to ``requests.get``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        The paths that were written, in download order.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.HTTPError\n",
"        If the server answers with an error status.\n",
"    \"\"\"\n",
"    indices = list(index) if hasattr(index, '__iter__') else [index]\n",
"\n",
"    # Opening a directory for writing raises IsADirectoryError, and several\n",
"    # files cannot share one target file, so both cases need a per-file path.\n",
"    into_directory = os.path.isdir(output_path) or len(indices) > 1\n",
"    if into_directory:\n",
"        os.makedirs(output_path, exist_ok=True)\n",
"\n",
"    written = []\n",
"    for i in indices:\n",
"        entry = files_list[i]\n",
"        # Work around https://github.com/DataBiosphere/azul/issues/2908\n",
"        url = entry['url'].replace('/fetch', '')\n",
"\n",
"        if into_directory:\n",
"            # basename() keeps a name taken from the API inside output_path.\n",
"            target = os.path.join(output_path, os.path.basename(entry['fileName']))\n",
"        else:\n",
"            target = output_path\n",
"\n",
"        # The context manager releases the connection even if writing fails.\n",
"        with requests.get(url, stream=True, timeout=timeout) as response:\n",
"            response.raise_for_status()\n",
"            total = int(response.headers.get('content-length', 0))\n",
"            print(f'Downloading to: {target}', flush=True)\n",
"\n",
"            with open(target, 'wb') as f:\n",
"                with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:\n",
"                    for chunk in response.iter_content(chunk_size=chunk_size):\n",
"                        bar.update(f.write(chunk))\n",
"        written.append(target)\n",
"\n",
"    return written"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Configuration: project to inspect, catalog to query, and download directory.\n",
"project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'\n",
"catalog = 'dcp26'\n",
"endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'\n",
"\n",
"# Set the HCA_DATA_DIR environment variable to store downloads somewhere other\n",
"# than the default location below.\n",
"save_location = os.environ.get(\n",
"    'HCA_DATA_DIR',\n",
"    '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloads Complete.\n"
]
}
],
"source": [
"# Example: list the downloadable matrix files of one project.\n",
"\n",
"def iterate_matrices_tree(tree, keys=()):\n",
"    \"\"\"Yield ``(keys, file_info)`` for every file found in a matrices tree.\n",
"\n",
"    ``keys`` is the tuple of dict keys leading to the file.\n",
"    NOTE(review): assumes the tree consists of nested dicts whose leaves are\n",
"    lists of file-info dicts - confirm against the Azul API documentation.\n",
"    \"\"\"\n",
"    if isinstance(tree, dict):\n",
"        for key, subtree in tree.items():\n",
"            yield from iterate_matrices_tree(subtree, keys=(*keys, key))\n",
"    elif isinstance(tree, list):\n",
"        for file_info in tree:\n",
"            yield keys, file_info\n",
"    else:\n",
"        raise TypeError(f'Unexpected node in matrices tree: {type(tree).__name__}')\n",
"\n",
"\n",
"response = requests.get(endpoint_url, params={'catalog': catalog})\n",
"response.raise_for_status()\n",
"response_json = response.json()\n",
"project = response_json['projects'][0]\n",
"\n",
"file_urls = set()\n",
"for key in ('matrices', 'contributedAnalyses'):\n",
"    tree = project[key]\n",
"    for path, file_info in iterate_matrices_tree(tree):\n",
"        url = file_info['url']\n",
"        if url not in file_urls:\n",
"            dest_path = os.path.join(save_location, file_info['name'])\n",
"            # TODO to really download the data, uncomment the next line.\n",
"            # download_file expects (index, output_path, files_list), so the\n",
"            # single file is wrapped in a one-element list:\n",
"            # download_file([0], dest_path, [{'url': url}])\n",
"            file_urls.add(url)\n",
"print('Downloads Complete.')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"# Query the Azul file index for file metadata (loom files by default).\n",
"def get_files_metadata(catalog='dcp26', file_formats=('loom', 'loom.gz'), size=10,\n",
"                       sort='lastModifiedDate', order='asc', extra_filters=None,\n",
"                       timeout=60):\n",
"    \"\"\"Request one page of file metadata from the Azul ``/index/files`` endpoint.\n",
"\n",
"    Called without arguments it sends the same query as before: catalog\n",
"    ``dcp26``, formats ``loom`` and ``loom.gz``, 10 hits, sorted ascending by\n",
"    ``lastModifiedDate``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    catalog : str\n",
"        Value of the ``catalog`` query parameter.\n",
"    file_formats : iterable of str\n",
"        Accepted values for the ``fileFormat`` filter.\n",
"    size : int\n",
"        Value of the ``size`` query parameter.\n",
"    sort, order : str\n",
"        Values of the ``sort`` and ``order`` query parameters.\n",
"    extra_filters : dict, optional\n",
"        Additional filter entries merged into the ``filters`` parameter; they\n",
"        are expected to have the same shape as the ``fileFormat`` entry.\n",
"    timeout : float or tuple, optional\n",
"        Timeout in seconds handed to ``requests.get``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict or None\n",
"        The decoded JSON body on HTTP 200; otherwise the status code is\n",
"        printed and ``None`` is returned.\n",
"    \"\"\"\n",
"    filters = {'fileFormat': {'is': list(file_formats)}}\n",
"    if extra_filters:\n",
"        filters.update(extra_filters)\n",
"\n",
"    params = {\n",
"        'catalog': catalog,\n",
"        # json.dumps guarantees a well-formed filter string.\n",
"        'filters': json.dumps(filters),\n",
"        'size': size,\n",
"        'sort': sort,\n",
"        'order': order,\n",
"    }\n",
"    url = 'https://service.azul.data.humancellatlas.org/index/files'\n",
"    response = requests.get(url, params=params, timeout=timeout)\n",
"\n",
"    if response.status_code == 200:\n",
"        return response.json()\n",
"\n",
"    # Best-effort error reporting: callers must be prepared for None.\n",
"    print(\"Error:\", response.status_code)\n",
"    return None"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def get_file_list(response_json):\n",
"    \"\"\"Flatten the hits of a file-index response into a list of dicts.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    response_json : dict\n",
"        Decoded response containing a ``'hits'`` list; every hit provides\n",
"        ``'files'``, ``'projects'`` and ``'entryId'``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of dict\n",
"        One dict per file with the keys ``index`` (running number starting at\n",
"        0), ``fileName``, ``size``, ``version``, ``projectShortname``,\n",
"        ``projectId``, ``entryId``, ``contentDescription`` and ``url``.\n",
"        List-valued fields are reduced to their first element, or ``None``\n",
"        when the list is empty or missing.\n",
"    \"\"\"\n",
"    def _first(values, default=None):\n",
"        # Guard the [0] lookups against empty or missing lists.\n",
"        return values[0] if values else default\n",
"\n",
"    file_data = []\n",
"    for hit in response_json['hits']:\n",
"        project = _first(hit['projects'], default={})\n",
"        for file in hit['files']:\n",
"            file_data.append({\n",
"                # The index is the position in the flattened list.\n",
"                'index': len(file_data),\n",
"                'fileName': file['name'],\n",
"                'size': file['size'],\n",
"                'version': file['version'],\n",
"                'projectShortname': _first(project.get('projectShortname')),\n",
"                'projectId': _first(project.get('projectId')),\n",
"                'entryId': hit['entryId'],\n",
"                'contentDescription': _first(file.get('contentDescription')),\n",
"                'url': file['url'],\n",
"            })\n",
"    return file_data\n",
"\n",
"def print_file_table(response_json):\n",
"    \"\"\"Print file entries as a ``fancy_grid`` table.\n",
"\n",
"    Despite its name, ``response_json`` is iterated directly as a sequence of\n",
"    file dicts with the keys ``index``, ``fileName``, ``size``, ``version``,\n",
"    ``projectShortname``, ``projectId``, ``entryId`` and\n",
"    ``contentDescription``.\n",
"    \"\"\"\n",
"    # (dict key, column header) pairs in display order.\n",
"    columns = [\n",
"        ('index', \"Index\"),\n",
"        ('fileName', \"File Name\"),\n",
"        ('size', \"Size\"),\n",
"        ('version', \"Version\"),\n",
"        ('projectShortname', \"Project (shortname)\"),\n",
"        ('projectId', \"ProjectID\"),\n",
"        ('entryId', \"EntryId\"),\n",
"        ('contentDescription', \"Content Description\"),\n",
"    ]\n",
"    column_titles = [title for _, title in columns]\n",
"\n",
"    rows = []\n",
"    for entry in response_json:\n",
"        rows.append([entry[key] for key, _ in columns])\n",
"\n",
"    print(tabulate(rows, column_titles, tablefmt='fancy_grid'))\n"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕\n",
"│ Index │ File Name │ Size │ Version │ Project (shortname) │ ProjectID │ EntryId │ Content Description │\n",
"╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡\n",
"│ 0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │ 854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │ 938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 4 │ t-cell-activation-human-lung-10XV2.loom │ 395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │ 423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │ 745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │ 698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix │\n",
"├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
"│ 9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │ 425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix │\n",
"╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛\n",
"Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\n"
]
},
{
"ename": "IsADirectoryError",
"evalue": "[Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIsADirectoryError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[75], line 12\u001b[0m\n\u001b[1;32m 9\u001b[0m catalog \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mdcp26\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 10\u001b[0m save_location \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m---> 12\u001b[0m download_file(index, save_location,files_list)\n",
"Cell \u001b[0;32mIn[68], line 16\u001b[0m, in \u001b[0;36mdownload_file\u001b[0;34m(index, output_path, files_list)\u001b[0m\n\u001b[1;32m 13\u001b[0m total \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(response\u001b[39m.\u001b[39mheaders\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mcontent-length\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m0\u001b[39m))\n\u001b[1;32m 14\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mDownloading to: \u001b[39m\u001b[39m{\u001b[39;00moutput_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m, flush\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m---> 16\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(output_path, \u001b[39m'\u001b[39;49m\u001b[39mwb\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 17\u001b[0m \u001b[39mwith\u001b[39;00m tqdm(total\u001b[39m=\u001b[39mtotal, unit\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mB\u001b[39m\u001b[39m'\u001b[39m, unit_scale\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, unit_divisor\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m) \u001b[39mas\u001b[39;00m bar:\n\u001b[1;32m 18\u001b[0m \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m response\u001b[39m.\u001b[39miter_content(chunk_size\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m):\n",
"File \u001b[0;32m~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[39mif\u001b[39;00m file \u001b[39min\u001b[39;00m {\u001b[39m0\u001b[39m, \u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m}:\n\u001b[1;32m 278\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 279\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIPython won\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt let you open fd=\u001b[39m\u001b[39m{\u001b[39;00mfile\u001b[39m}\u001b[39;00m\u001b[39m by default \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 280\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 281\u001b[0m \u001b[39m\"\u001b[39m\u001b[39myou can use builtins\u001b[39m\u001b[39m'\u001b[39m\u001b[39m open.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 282\u001b[0m )\n\u001b[0;32m--> 284\u001b[0m \u001b[39mreturn\u001b[39;00m io_open(file, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'"
]
}
],
"source": [
"# Example workflow: list the available loom files, then download a selection.\n",
"files_response = get_files_metadata()\n",
"files_list = get_file_list(files_response)\n",
"print_file_table(files_list)\n",
"\n",
"# Indices (first column of the table above) of the files to download.\n",
"index = [6]\n",
"\n",
"# save_location is defined in the configuration cell near the top of the notebook.\n",
"os.makedirs(save_location, exist_ok=True)\n",
"for i in index:\n",
"    # Pass a full file path rather than the bare directory: opening a\n",
"    # directory for writing raises IsADirectoryError.\n",
"    target_path = os.path.join(save_location, files_list[i]['fileName'])\n",
"    download_file([i], target_path, files_list)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Weiterführende Notizen\n",
"\n",
"https://www.askpython.com/python/examples/pull-data-from-an-api"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Loom Dateien: \n",
"\n",
"Umgang mit LoomPy: \n",
"\n",
"https://linnarssonlab.org/loompy/apiwalkthrough/index.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading