Paul Kuehnel · Paul Kuehnel
--- a/get loom via api/API_access.ipynb 0 → 100644

+ 296

− 0
+++ b/get loom via api/API_access.ipynb 0 → 100644

+ 296

− 0
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download loom files via API\n",
+    "\n",
+    "### See the API documentation\n",
+    "[documentation](https://service.azul.data.humancellatlas.org/)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import os\n",
+    "from tqdm import tqdm\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "from tabulate import tabulate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download one or more files selected by index from a file list.\n",
+    "# Modified from the example code.\n",
+    "\n",
+    "def download_file(index, output_path, files_list):\n",
+    "    \"\"\"Download the entries of `files_list` selected by `index`.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    index : int or iterable of int\n",
+    "        Position(s) in `files_list` of the file(s) to download.\n",
+    "    output_path : str\n",
+    "        Existing directory or full file path. If it is a directory, each\n",
+    "        file is saved inside it under its 'fileName'. Otherwise the data is\n",
+    "        written to `output_path` itself, so with several indices each\n",
+    "        download overwrites the previous one.\n",
+    "    files_list : sequence of dict\n",
+    "        Entries with at least a 'url' key; 'fileName' is also required when\n",
+    "        `output_path` is a directory (see `get_file_list`).\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of str\n",
+    "        The paths that were written, in download order.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    requests.HTTPError\n",
+    "        If the server answers with an error status.\n",
+    "    \"\"\"\n",
+    "    if isinstance(index, int):\n",
+    "        index = [index]  # accept a single index as well as a list\n",
+    "\n",
+    "    written = []\n",
+    "    for i in index:\n",
+    "        entry = files_list[i]\n",
+    "        # Work around https://github.com/DataBiosphere/azul/issues/2908\n",
+    "        url = entry['url'].replace('/fetch', '')\n",
+    "\n",
+    "        # Opening a directory for writing raises IsADirectoryError, so\n",
+    "        # append the file name when only a directory was given.\n",
+    "        if os.path.isdir(output_path):\n",
+    "            dest_path = os.path.join(output_path, entry['fileName'])\n",
+    "        else:\n",
+    "            dest_path = output_path\n",
+    "\n",
+    "        # The context manager releases the connection even if writing fails.\n",
+    "        with requests.get(url, stream=True) as response:\n",
+    "            response.raise_for_status()\n",
+    "\n",
+    "            total = int(response.headers.get('content-length', 0))\n",
+    "            print(f'Downloading to: {dest_path}', flush=True)\n",
+    "\n",
+    "            with open(dest_path, 'wb') as f:\n",
+    "                with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:\n",
+    "                    # 1 MiB chunks keep the Python-level loop short for GB-sized files.\n",
+    "                    for chunk in response.iter_content(chunk_size=1024 * 1024):\n",
+    "                        bar.update(f.write(chunk))\n",
+    "\n",
+    "        written.append(dest_path)\n",
+    "\n",
+    "    return written\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Project to query and the catalog that is passed along with the request\n",
+    "project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'\n",
+    "catalog = 'dcp26'\n",
+    "endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'\n",
+    "\n",
+    "# Directory for downloaded files. Set the environment variable LOOM_SAVE_LOCATION\n",
+    "# to use a different directory without editing the notebook.\n",
+    "save_location = os.environ.get(\n",
+    "    'LOOM_SAVE_LOCATION',\n",
+    "    '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloads Complete.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Example: collect (and optionally download) all matrix files of one project\n",
+    "\n",
+    "def iterate_matrices_tree(tree, keys=()):\n",
+    "    \"\"\"Yield `(keys, file_info)` for every file entry in a nested tree.\n",
+    "\n",
+    "    Dicts are descended into recursively and the dict keys passed on the way\n",
+    "    are collected in `keys`; every element of a list is yielded as a file entry.\n",
+    "\n",
+    "    Raises TypeError for a node that is neither a dict nor a list.\n",
+    "    \"\"\"\n",
+    "    if isinstance(tree, dict):\n",
+    "        for k, v in tree.items():\n",
+    "            yield from iterate_matrices_tree(v, keys=(*keys, k))\n",
+    "    elif isinstance(tree, list):\n",
+    "        for file_info in tree:\n",
+    "            yield keys, file_info\n",
+    "    else:\n",
+    "        raise TypeError(f'Unexpected node in matrices tree: {type(tree).__name__}')\n",
+    "\n",
+    "\n",
+    "response = requests.get(endpoint_url, params={'catalog': catalog})\n",
+    "response.raise_for_status()\n",
+    "response_json = response.json()\n",
+    "project = response_json['projects'][0]\n",
+    "\n",
+    "# Remember the URLs already handled so that every file is processed only once.\n",
+    "file_urls = set()\n",
+    "for key in ('matrices', 'contributedAnalyses'):\n",
+    "    tree = project[key]\n",
+    "    for path, file_info in iterate_matrices_tree(tree):\n",
+    "        url = file_info['url']\n",
+    "        if url not in file_urls:\n",
+    "            dest_path = os.path.join(save_location, file_info['name'])\n",
+    "            # TODO uncomment the following line if you really want to download data:\n",
+    "            # download_file([0], dest_path, [{'url': url, 'fileName': file_info['name']}])\n",
+    "            file_urls.add(url)\n",
+    "print('Downloads Complete.')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch metadata of loom files from the `/index/files` endpoint.\n",
+    "def get_files_metadata(catalog='dcp26', file_formats=('loom', 'loom.gz'), size=10,\n",
+    "                       sort='lastModifiedDate', order='asc'):\n",
+    "    \"\"\"Query the file index and return the decoded JSON response.\n",
+    "\n",
+    "    Called without arguments, the same query as before is sent.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    catalog : str\n",
+    "        Value of the `catalog` query parameter.\n",
+    "    file_formats : str or iterable of str\n",
+    "        File formats to match; sent as the `fileFormat` filter.\n",
+    "    size : int\n",
+    "        Value of the `size` query parameter (presumably the number of hits\n",
+    "        per response -- confirm against the API documentation).\n",
+    "    sort : str\n",
+    "        Value of the `sort` query parameter.\n",
+    "    order : str\n",
+    "        Value of the `order` query parameter.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        The decoded JSON body of the response.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    requests.HTTPError\n",
+    "        If the server answers with an error status. Raising here avoids\n",
+    "        handing `None` to the functions that process the result.\n",
+    "    \"\"\"\n",
+    "    if isinstance(file_formats, str):\n",
+    "        file_formats = [file_formats]  # a bare string would otherwise be split into characters\n",
+    "\n",
+    "    params = {\n",
+    "        'catalog': catalog,\n",
+    "        # the filter is itself a JSON document passed as a query-string value\n",
+    "        'filters': json.dumps({'fileFormat': {'is': list(file_formats)}}),\n",
+    "        'size': size,\n",
+    "        'sort': sort,\n",
+    "        'order': order,\n",
+    "    }\n",
+    "    url = 'https://service.azul.data.humancellatlas.org/index/files'\n",
+    "    response = requests.get(url, params=params)\n",
+    "    response.raise_for_status()\n",
+    "    return response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def get_file_list(response_json):\n",
+    "    \"\"\"Flatten the hits of a file-index response into one list of dicts.\n",
+    "\n",
+    "    Every file of every hit becomes one dict holding a running 'index'\n",
+    "    (starting at 0) followed by the file's name, size, version, the short\n",
+    "    name and id of the hit's first project, the hit's entry id, the first\n",
+    "    content description and the download url.\n",
+    "    \"\"\"\n",
+    "    records = []\n",
+    "    for hit in response_json['hits']:\n",
+    "        for file in hit['files']:\n",
+    "            records.append({\n",
+    "                # position in the flattened list, used to select files later\n",
+    "                'index': len(records),\n",
+    "                'fileName': file['name'],\n",
+    "                'size': file['size'],\n",
+    "                'version': file['version'],\n",
+    "                'projectShortname': hit['projects'][0]['projectShortname'][0],\n",
+    "                'projectId': hit['projects'][0]['projectId'][0],\n",
+    "                'entryId': hit['entryId'],\n",
+    "                'contentDescription': file['contentDescription'][0],\n",
+    "                'url': file['url'],\n",
+    "            })\n",
+    "    return records\n",
+    "\n",
+    "def print_file_table(response_json):\n",
+    "    \"\"\"Print file records as a 'fancy_grid' table.\n",
+    "\n",
+    "    Despite its name, `response_json` is iterated as a sequence of dicts\n",
+    "    with the keys listed in `columns` below (the output of `get_file_list`),\n",
+    "    not as a raw API response.\n",
+    "    \"\"\"\n",
+    "    # (column header, key in the file record), in display order\n",
+    "    columns = [\n",
+    "        ('Index', 'index'),\n",
+    "        ('File Name', 'fileName'),\n",
+    "        ('Size', 'size'),\n",
+    "        ('Version', 'version'),\n",
+    "        ('Project (shortname)', 'projectShortname'),\n",
+    "        ('ProjectID', 'projectId'),\n",
+    "        ('EntryId', 'entryId'),\n",
+    "        ('Content Description', 'contentDescription'),\n",
+    "    ]\n",
+    "    headers = [title for title, _ in columns]\n",
+    "    rows = [[record[key] for _, key in columns] for record in response_json]\n",
+    "\n",
+    "    print(tabulate(rows, headers, tablefmt='fancy_grid'))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕\n",
+      "│   Index │ File Name                                 │       Size │ Version                     │ Project (shortname)        │ ProjectID                            │ EntryId                              │ Content Description   │\n",
+      "╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡\n",
+      "│       0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │  854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │  938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       4 │ t-cell-activation-human-lung-10XV2.loom   │  395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │  423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │  745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │  698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix          │\n",
+      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
+      "│       9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │  425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix          │\n",
+      "╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛\n",
+      "Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\n"
+     ]
+    },
+    {
+     "ename": "IsADirectoryError",
+     "evalue": "[Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mIsADirectoryError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[75], line 12\u001b[0m\n\u001b[1;32m      9\u001b[0m catalog \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mdcp26\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m     10\u001b[0m save_location \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m---> 12\u001b[0m download_file(index, save_location,files_list)\n",
+      "Cell \u001b[0;32mIn[68], line 16\u001b[0m, in \u001b[0;36mdownload_file\u001b[0;34m(index, output_path, files_list)\u001b[0m\n\u001b[1;32m     13\u001b[0m total \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(response\u001b[39m.\u001b[39mheaders\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mcontent-length\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m0\u001b[39m))\n\u001b[1;32m     14\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mDownloading to: \u001b[39m\u001b[39m{\u001b[39;00moutput_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m, flush\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m---> 16\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(output_path, \u001b[39m'\u001b[39;49m\u001b[39mwb\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m     17\u001b[0m     \u001b[39mwith\u001b[39;00m tqdm(total\u001b[39m=\u001b[39mtotal, unit\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mB\u001b[39m\u001b[39m'\u001b[39m, unit_scale\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, unit_divisor\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m) \u001b[39mas\u001b[39;00m bar:\n\u001b[1;32m     18\u001b[0m         \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m response\u001b[39m.\u001b[39miter_content(chunk_size\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m):\n",
+      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m    277\u001b[0m \u001b[39mif\u001b[39;00m file \u001b[39min\u001b[39;00m {\u001b[39m0\u001b[39m, \u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m}:\n\u001b[1;32m    278\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m    279\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIPython won\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt let you open fd=\u001b[39m\u001b[39m{\u001b[39;00mfile\u001b[39m}\u001b[39;00m\u001b[39m by default \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    280\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    281\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39myou can use builtins\u001b[39m\u001b[39m'\u001b[39m\u001b[39m open.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    282\u001b[0m     )\n\u001b[0;32m--> 284\u001b[0m \u001b[39mreturn\u001b[39;00m io_open(file, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+      "\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'"
+     ]
+    }
+   ],
+   "source": [
+    "# Example workflow (uses `save_location` from the configuration cell above)\n",
+    "# 1. Fetch the file metadata\n",
+    "files_metadata = get_files_metadata()\n",
+    "# 2. Flatten it into a list of records and print them as a table\n",
+    "files_list = get_file_list(files_metadata)\n",
+    "print_file_table(files_list)\n",
+    "# 3. Specify which file(s) to download (list of values from the 'Index' column)\n",
+    "index = [6]\n",
+    "\n",
+    "# Pass a full file path (directory + file name) for every file; passing only\n",
+    "# the directory raised IsADirectoryError.\n",
+    "for i in index:\n",
+    "    download_file([i], os.path.join(save_location, files_list[i]['fileName']), files_list)\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Weiterführende Notizen\n",
+    "\n",
+    "https://www.askpython.com/python/examples/pull-data-from-an-api"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loom Dateien: \n",
+    "\n",
+    "Umgang mit LoomPy: \n",
+    "\n",
+    "https://linnarssonlab.org/loompy/apiwalkthrough/index.html"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}