{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ " # download loom files via api \n", " \n", "### see api documentation\n", "[documentation](https://service.azul.data.humancellatlas.org/)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import os\n", "from tqdm import tqdm\n", "import json\n", "import pandas as pd\n", "from tabulate import tabulate" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "# example function, downloading a single file which was specified before \n", "# modified! \n", "# TODO make this function work. Add file name to output_pat (see example code)\n", "\n", "def download_file(index, output_path, files_list):\n", " for i in index: \n", "\n", " url_tp = files_list[i]['url']\n", " url = url_tp.replace('/fetch', '') # Work around https://github.com/DataBiosphere/azul/issues/2908\n", " \n", " response = requests.get(url, stream=True)\n", " response.raise_for_status()\n", " \n", " total = int(response.headers.get('content-length', 0))\n", " print(f'Downloading to: {output_path}', flush=True)\n", " \n", " with open(output_path, 'wb') as f:\n", " with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:\n", " for chunk in response.iter_content(chunk_size=1024):\n", " size = f.write(chunk)\n", " bar.update(size)\n", "\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'\n", "catalog = 'dcp26'\n", "endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'\n", "\n", "save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloads Complete.\n" ] } ], "source": [ "# example downoading of a project\n", "\n", "response = requests.get(endpoint_url, params={'catalog': catalog,})\n", "response.raise_for_status()\n", "response_json = response.json()\n", "project = response_json['projects'][0]\n", "\n", "file_urls = set()\n", "for key in ('matrices', 'contributedAnalyses'):\n", " tree = project[key]\n", " for path, file_info in iterate_matrices_tree(tree):\n", " url = file_info['url']\n", " if url not in file_urls:\n", " dest_path = os.path.join(save_location, file_info['name'])\n", " # TODO uncomment the folllowing line if you really want to download data: \n", " # download_file(url, dest_path)\n", " file_urls.add(url)\n", "print('Downloads Complete.')\n", "\n" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "# function gets meta data of loom files\n", "# TODO make default params and abiltiy to passing params to the function\n", "def get_files_metadata():\n", " params = {\n", " 'catalog': 'dcp26',\n", " 'filters': '{\"fileFormat\": {\"is\": [\"loom\",\"loom.gz\"]}}',\n", " 'size': 10,\n", " 'sort': 'lastModifiedDate',\n", " 'order': 'asc'\n", " }\n", " url = f'https://service.azul.data.humancellatlas.org/index/files'\n", " response = requests.get(url, params=params)\n", "\n", " # Check the response status code\n", " if response.status_code == 200:\n", " # Request was successful\n", " response_json = response.json()\n", " return response_json\n", " else:\n", " # An error occurred\n", " print(\"Error:\", response.status_code)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "\n", "def get_file_list(response_json):\n", " # extract file-information into a list of dicts\n", " file_data = [] #list\n", " for hit in response_json['hits']:\n", " for file in hit['files']:\n", " file_dict = {\n", " 'fileName':file['name'],\n", " 'size':file['size'],\n", " 'version':file['version'],\n", " 'projectShortname':hit['projects'][0]['projectShortname'][0],\n", " 'projectId':hit['projects'][0]['projectId'][0],\n", " 'entryId':hit['entryId'],\n", " 'contentDescription':file['contentDescription'][0],\n", " 'url':file['url']\n", " }\n", " file_data.append(file_dict)\n", " file_data_with_index = [{\n", " 'index': i,\n", " **file_dict\n", " } for i, file_dict in enumerate(file_data)]\n", " return file_data_with_index\n", "\n", "def print_file_table(response_json):\n", "\n", " #file_data = get_file_list(response_json)\n", " file_data = response_json\n", " \n", " # create nested list out of file_data\n", " headers = [\"Index\", \"File Name\" , \"Size\", \"Version\", \"Project (shortname)\", \"ProjectID\", \n", " \"EntryId\", \"Content Description\"]\n", " table_data = [[\n", " file['index'],\n", " file['fileName'],\n", " file['size'],\n", " file['version'],\n", " file['projectShortname'],\n", " file['projectId'],\n", " file['entryId'],\n", " file['contentDescription'],\n", " ] for file in file_data]\n", "\n", " table = tabulate(table_data, headers, tablefmt='fancy_grid')\n", " print(table)\n" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕\n", "│ Index │ File Name │ Size │ Version │ Project (shortname) │ ProjectID │ EntryId │ Content Description │\n", "╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡\n", "│ 0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │ 854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │ 938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 4 │ t-cell-activation-human-lung-10XV2.loom │ 395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │ 423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │ 745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │ 698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix │\n", "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n", "│ 9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │ 425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix │\n", "╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛\n", "Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\n" ] }, { "ename": "IsADirectoryError", "evalue": "[Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIsADirectoryError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[75], line 12\u001b[0m\n\u001b[1;32m 9\u001b[0m catalog \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mdcp26\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 10\u001b[0m save_location \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m---> 12\u001b[0m download_file(index, save_location,files_list)\n", "Cell \u001b[0;32mIn[68], line 16\u001b[0m, in \u001b[0;36mdownload_file\u001b[0;34m(index, output_path, files_list)\u001b[0m\n\u001b[1;32m 13\u001b[0m total \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(response\u001b[39m.\u001b[39mheaders\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mcontent-length\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m0\u001b[39m))\n\u001b[1;32m 14\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mDownloading to: \u001b[39m\u001b[39m{\u001b[39;00moutput_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m, flush\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m---> 16\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(output_path, \u001b[39m'\u001b[39;49m\u001b[39mwb\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 17\u001b[0m \u001b[39mwith\u001b[39;00m tqdm(total\u001b[39m=\u001b[39mtotal, unit\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mB\u001b[39m\u001b[39m'\u001b[39m, unit_scale\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, unit_divisor\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m) \u001b[39mas\u001b[39;00m bar:\n\u001b[1;32m 18\u001b[0m \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m response\u001b[39m.\u001b[39miter_content(chunk_size\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m):\n", "File \u001b[0;32m~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[39mif\u001b[39;00m file \u001b[39min\u001b[39;00m {\u001b[39m0\u001b[39m, \u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m}:\n\u001b[1;32m 278\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 279\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIPython won\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt let you open fd=\u001b[39m\u001b[39m{\u001b[39;00mfile\u001b[39m}\u001b[39;00m\u001b[39m by default \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 280\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 281\u001b[0m \u001b[39m\"\u001b[39m\u001b[39myou can use builtins\u001b[39m\u001b[39m'\u001b[39m\u001b[39m open.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 282\u001b[0m )\n\u001b[0;32m--> 284\u001b[0m \u001b[39mreturn\u001b[39;00m io_open(file, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", "\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'" ] } ], "source": [ "# example workflow\n", "# get file metadata \n", "antwort = get_files_metadata()\n", "# transform & print it as list\n", "files_list = get_file_list(antwort)\n", "print_file_table(files_list)\n", "# specifiy which file(s) you want to download (provide index-list)\n", "index = [6]\n", "catalog = 'dcp26'\n", "save_location = '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'\n", "\n", "download_file(index, save_location,files_list)\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Weiterführende Notizen\n", "\n", "https://www.askpython.com/python/examples/pull-data-from-an-api" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Loom Dateien: \n", "\n", "Umgang mit LoomPy: \n", "\n", "https://linnarssonlab.org/loompy/apiwalkthrough/index.html" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }