API_access.ipynb

{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download loom files via the Azul API\n",
    "\n",
    "### API documentation\n",
    "[Azul service API documentation](https://service.azul.data.humancellatlas.org/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import pandas as pd\n",
    "from tabulate import tabulate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the files selected from the list returned by get_file_list().\n",
    "\n",
    "def download_file(index, output_path, files_list):\n",
    "    \"\"\"Download the files at the given positions of ``files_list``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    index : int or iterable of int\n",
    "        Position(s) in ``files_list`` of the file(s) to download.\n",
    "    output_path : str\n",
    "        Either an existing directory or the full path of the target file.\n",
    "        For a directory, every file is stored inside it under the name\n",
    "        given by its 'fileName' entry. Any other path is opened as-is.\n",
    "    files_list : sequence of dict\n",
    "        Entries providing a 'url' key (and 'fileName' when ``output_path``\n",
    "        is a directory).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.HTTPError\n",
    "        If the server answers with an error status.\n",
    "    \"\"\"\n",
    "    # Accept a bare index as well as a list of indices\n",
    "    if isinstance(index, int):\n",
    "        index = [index]\n",
    "\n",
    "    for i in index:\n",
    "        file_info = files_list[i]\n",
    "        # Work around https://github.com/DataBiosphere/azul/issues/2908\n",
    "        url = file_info['url'].replace('/fetch', '')\n",
    "\n",
    "        # A directory cannot be opened for writing (IsADirectoryError), so\n",
    "        # append the file name; this also keeps several downloads apart.\n",
    "        if os.path.isdir(output_path):\n",
    "            dest_path = os.path.join(output_path, file_info['fileName'])\n",
    "        else:\n",
    "            dest_path = output_path\n",
    "\n",
    "        # The context manager releases the streamed connection when done\n",
    "        with requests.get(url, stream=True) as response:\n",
    "            response.raise_for_status()\n",
    "\n",
    "            total = int(response.headers.get('content-length', 0))\n",
    "            print(f'Downloading to: {dest_path}', flush=True)\n",
    "\n",
    "            with open(dest_path, 'wb') as f:\n",
    "                with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:\n",
    "                    for chunk in response.iter_content(chunk_size=1024):\n",
    "                        size = f.write(chunk)\n",
    "                        bar.update(size)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project and catalog queried in the project-download example\n",
    "project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'\n",
    "catalog = 'dcp26'\n",
    "endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'\n",
    "\n",
    "# Directory downloads are written to. Set the HCA_SAVE_LOCATION environment\n",
    "# variable to use a different directory without editing the notebook.\n",
    "save_location = os.environ.get(\n",
    "    'HCA_SAVE_LOCATION',\n",
    "    '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloads Complete.\n"
     ]
    }
   ],
   "source": [
    "# Example: collect (and optionally download) all matrix files of one project\n",
    "\n",
    "def iterate_matrices_tree(tree, keys=()):\n",
    "    \"\"\"Yield ``(keys, file_info)`` for every file found in ``tree``.\n",
    "\n",
    "    Dicts are descended into (their keys are accumulated in ``keys``) and\n",
    "    every element of a list is yielded as a file entry.\n",
    "    NOTE(review): assumes the 'matrices' / 'contributedAnalyses' trees are\n",
    "    nested dicts whose leaves are lists of file dicts - confirm against the\n",
    "    API response.\n",
    "    \"\"\"\n",
    "    if isinstance(tree, dict):\n",
    "        for key, subtree in tree.items():\n",
    "            yield from iterate_matrices_tree(subtree, keys=(*keys, key))\n",
    "    elif isinstance(tree, list):\n",
    "        for file_info in tree:\n",
    "            yield keys, file_info\n",
    "    else:\n",
    "        raise TypeError(f'Unexpected node in matrices tree: {type(tree).__name__}')\n",
    "\n",
    "response = requests.get(endpoint_url, params={'catalog': catalog,})\n",
    "response.raise_for_status()\n",
    "response_json = response.json()\n",
    "project = response_json['projects'][0]\n",
    "\n",
    "# Remember the URLs already handled so that no file is processed twice\n",
    "file_urls = set()\n",
    "for key in ('matrices', 'contributedAnalyses'):\n",
    "    tree = project[key]\n",
    "    for path, file_info in iterate_matrices_tree(tree):\n",
    "        url = file_info['url']\n",
    "        if url not in file_urls:\n",
    "            dest_path = os.path.join(save_location, file_info['name'])\n",
    "            # TODO uncomment the following line if you really want to download data:\n",
    "            # download_file([0], dest_path, [{'url': url, 'fileName': file_info['name']}])\n",
    "            file_urls.add(url)\n",
    "print('Downloads Complete.')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch metadata of loom files from the Azul file index\n",
    "def get_files_metadata(params=None):\n",
    "    \"\"\"Query the Azul ``/index/files`` endpoint and return the decoded JSON.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    params : dict, optional\n",
    "        Query parameters that override or extend the defaults below\n",
    "        (catalog 'dcp26', loom / loom.gz files only, page size 10, sorted\n",
    "        by ascending 'lastModifiedDate').\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict or None\n",
    "        The parsed JSON body on HTTP 200. For any other status the status\n",
    "        code is printed and None is returned.\n",
    "    \"\"\"\n",
    "    query = {\n",
    "        'catalog': 'dcp26',\n",
    "        'filters': '{\"fileFormat\": {\"is\": [\"loom\",\"loom.gz\"]}}',\n",
    "        'size': 10,\n",
    "        'sort': 'lastModifiedDate',\n",
    "        'order': 'asc'\n",
    "    }\n",
    "    # Caller-supplied values take precedence over the defaults\n",
    "    if params:\n",
    "        query.update(params)\n",
    "\n",
    "    url = 'https://service.azul.data.humancellatlas.org/index/files'\n",
    "    response = requests.get(url, params=query)\n",
    "\n",
    "    if response.status_code != 200:\n",
    "        # Keep the best-effort contract: report the failure and return None\n",
    "        print(\"Error:\", response.status_code)\n",
    "        return None\n",
    "    return response.json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_file_list(response_json):\n",
    "    \"\"\"Flatten the hits of a file-index response into a list of dicts.\n",
    "\n",
    "    Every file of every hit becomes one dict holding the file's name, size,\n",
    "    version, first content description and url together with the hit's\n",
    "    entry id and the short name / id of the hit's first project. Each dict\n",
    "    also carries its position in the returned list under 'index'.\n",
    "    \"\"\"\n",
    "    # Lazily pair every file with the hit it belongs to\n",
    "    hit_file_pairs = (\n",
    "        (hit, entry)\n",
    "        for hit in response_json['hits']\n",
    "        for entry in hit['files']\n",
    "    )\n",
    "    return [\n",
    "        {\n",
    "            'index': position,\n",
    "            'fileName': entry['name'],\n",
    "            'size': entry['size'],\n",
    "            'version': entry['version'],\n",
    "            'projectShortname': hit['projects'][0]['projectShortname'][0],\n",
    "            'projectId': hit['projects'][0]['projectId'][0],\n",
    "            'entryId': hit['entryId'],\n",
    "            'contentDescription': entry['contentDescription'][0],\n",
    "            'url': entry['url'],\n",
    "        }\n",
    "        for position, (hit, entry) in enumerate(hit_file_pairs)\n",
    "    ]\n",
    "\n",
    "def print_file_table(response_json):\n",
    "    \"\"\"Print the given file records as a 'fancy_grid' table.\n",
    "\n",
    "    Despite its name, ``response_json`` is iterated directly as a sequence\n",
    "    of file dicts providing the keys 'index', 'fileName', 'size',\n",
    "    'version', 'projectShortname', 'projectId', 'entryId' and\n",
    "    'contentDescription' (the url is not shown).\n",
    "    \"\"\"\n",
    "    # Record keys in the order of the table columns\n",
    "    columns = ('index', 'fileName', 'size', 'version', 'projectShortname',\n",
    "               'projectId', 'entryId', 'contentDescription')\n",
    "    headers = ['Index', 'File Name', 'Size', 'Version', 'Project (shortname)',\n",
    "               'ProjectID', 'EntryId', 'Content Description']\n",
    "\n",
    "    # One table row per file record\n",
    "    rows = [[record[key] for key in columns] for record in response_json]\n",
    "\n",
    "    print(tabulate(rows, headers, tablefmt='fancy_grid'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "╒═════════╤═══════════════════════════════════════════╤════════════╤═════════════════════════════╤════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╤═══════════════════════╕\n",
      "│   Index │ File Name                                 │       Size │ Version                     │ Project (shortname)        │ ProjectID                            │ EntryId                              │ Content Description   │\n",
      "╞═════════╪═══════════════════════════════════════════╪════════════╪═════════════════════════════╪════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╪═══════════════════════╡\n",
      "│       0 │ 098cc66a-d806-42db-a1c8-fa99a0317d7c.loom │  854692581 │ 2021-02-03T19:43:20.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 131ea511-25f7-5801-993f-bfa25f8ca68d │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       1 │ 294fe5d9-c1e8-4670-80d3-4c2b0a5e33c1.loom │ 1530326527 │ 2021-02-03T19:51:58.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ 7848d80b-6b1d-56b5-b19a-9639e3c4efbe │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       2 │ d6536459-ab4e-4954-a0ce-5e6d07670039.loom │  938504115 │ 2021-02-03T19:44:43.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ b98cfaac-64f5-59f5-b42e-209186812c19 │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       3 │ a040dae6-e0b1-49cf-a9ee-9793d5ad7d9c.loom │ 1478984890 │ 2021-02-03T19:49:15.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ c7b6470c-e2f0-5141-a8a2-11eb0984689a │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       4 │ t-cell-activation-human-lung-10XV2.loom   │  395054566 │ 2021-02-10T18:04:33.000000Z │ HumanTissueTcellActivation │ 4a95101c-9ffc-4f30-a809-f04518a23803 │ d0b95f2c-98ae-582b-84f4-e2bd0c5a0adb │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       5 │ 0f14c412-5014-4ac0-9a71-858b2f047777.loom │  423142737 │ 2021-02-04T15:18:49.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 2b9f7c89-d1c2-53ef-a769-80fec2f7d9e6 │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       6 │ 37cad11b-c8c9-4d1f-b715-498b0f8d4b35.loom │ 1066947865 │ 2021-02-04T15:49:34.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 32c69d68-1792-53af-9f42-0e97c9afc94b │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       7 │ dc31f31d-ab56-4025-9834-99be638a2d50.loom │  745509487 │ 2021-02-04T15:34:25.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 38a0ef48-9df1-5fef-8eb6-b32fbb67aabd │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       8 │ afd0ea55-e710-4b46-bb05-2423e491b6f5.loom │  698042665 │ 2021-02-04T15:34:36.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 3cd78fb7-b7bc-5ab1-b122-47585f0023d4 │ Count Matrix          │\n",
      "├─────────┼───────────────────────────────────────────┼────────────┼─────────────────────────────┼────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────┼───────────────────────┤\n",
      "│       9 │ b3ce1085-08dc-42ff-a609-6968315327a8.loom │  425012253 │ 2021-02-04T15:30:01.000000Z │ KidneySingleCellAtlas      │ abe1a013-af7a-45ed-8c26-f3793c24a1f4 │ 44175006-91f3-5d95-9a08-b33e08ed1ae3 │ Count Matrix          │\n",
      "╘═════════╧═══════════════════════════════════════════╧════════════╧═════════════════════════════╧════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╧═══════════════════════╛\n",
      "Downloading to: /home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\n"
     ]
    },
    {
     "ename": "IsADirectoryError",
     "evalue": "[Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIsADirectoryError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[75], line 12\u001b[0m\n\u001b[1;32m      9\u001b[0m catalog \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mdcp26\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m     10\u001b[0m save_location \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m---> 12\u001b[0m download_file(index, save_location,files_list)\n",
      "Cell \u001b[0;32mIn[68], line 16\u001b[0m, in \u001b[0;36mdownload_file\u001b[0;34m(index, output_path, files_list)\u001b[0m\n\u001b[1;32m     13\u001b[0m total \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(response\u001b[39m.\u001b[39mheaders\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mcontent-length\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m0\u001b[39m))\n\u001b[1;32m     14\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mDownloading to: \u001b[39m\u001b[39m{\u001b[39;00moutput_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m, flush\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m---> 16\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(output_path, \u001b[39m'\u001b[39;49m\u001b[39mwb\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m     17\u001b[0m     \u001b[39mwith\u001b[39;00m tqdm(total\u001b[39m=\u001b[39mtotal, unit\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mB\u001b[39m\u001b[39m'\u001b[39m, unit_scale\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, unit_divisor\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m) \u001b[39mas\u001b[39;00m bar:\n\u001b[1;32m     18\u001b[0m         \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m response\u001b[39m.\u001b[39miter_content(chunk_size\u001b[39m=\u001b[39m\u001b[39m1024\u001b[39m):\n",
      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m    277\u001b[0m \u001b[39mif\u001b[39;00m file \u001b[39min\u001b[39;00m {\u001b[39m0\u001b[39m, \u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m}:\n\u001b[1;32m    278\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m    279\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIPython won\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt let you open fd=\u001b[39m\u001b[39m{\u001b[39;00mfile\u001b[39m}\u001b[39;00m\u001b[39m by default \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    280\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    281\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39myou can use builtins\u001b[39m\u001b[39m'\u001b[39m\u001b[39m open.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    282\u001b[0m     )\n\u001b[0;32m--> 284\u001b[0m \u001b[39mreturn\u001b[39;00m io_open(file, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/home/fran/Documents/AAMasterDataScience/BigDataPraktikum/data'"
     ]
    }
   ],
   "source": [
    "# Example workflow: list the available loom files, then download a selection.\n",
    "# `save_location` comes from the configuration cell further up.\n",
    "\n",
    "# Fetch the file metadata\n",
    "antwort = get_files_metadata()\n",
    "# Flatten it into a list of dicts and print it as a table\n",
    "files_list = get_file_list(antwort)\n",
    "print_file_table(files_list)\n",
    "# Specify which file(s) to download (list of indices from the table)\n",
    "index = [6]\n",
    "\n",
    "# Build an explicit destination path (directory + file name) for every\n",
    "# selected file and download the files one at a time\n",
    "for i in index:\n",
    "    dest_path = os.path.join(save_location, files_list[i]['fileName'])\n",
    "    download_file([i], dest_path, files_list)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Weiterführende Notizen\n",
    "\n",
    "https://www.askpython.com/python/examples/pull-data-from-an-api"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loom Dateien: \n",
    "\n",
    "Umgang mit LoomPy: \n",
    "\n",
    "https://linnarssonlab.org/loompy/apiwalkthrough/index.html"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}