{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Match phenotype annotations to AnnData cells by barcode\n",
    "\n",
    "Goal: annotate the cells in `adataWithVelocity.h5ad` with the `cell`, `treatment` and `replicate` columns of `phenoSCCall.csv` by matching cell barcodes.\n",
    "\n",
    "**Known problem (TODO):** a barcode alone does not identify a cell. Numbers recorded in an earlier session, before the outputs of this notebook were cleared:\n",
    "\n",
    "- `phenoSCCall.csv` had 23,914 rows.\n",
    "- Truncating the AnnData obs names to 16 characters left 33,209 distinct barcodes.\n",
    "- 172,419 AnnData cells had a barcode that also occurs in the phenotype table.\n",
    "- An inner join on barcode alone produced 177,151 rows covering 23,585 distinct barcodes, i.e. a many-to-many match.\n",
    "\n",
    "The join at the end of this notebook is therefore diagnostic only until a per-sample key is added to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scanpy as sc\n",
    "import scvelo as scv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: input files (relative to the notebook) and barcode layout.\n",
    "PHENO_CSV = 'phenoSCCall.csv'\n",
    "ADATA_H5AD = 'adataWithVelocity.h5ad'\n",
    "\n",
    "# Obs names look like 'AAACCTGAGCGATTCT-0'; the first 16 characters are the\n",
    "# barcode and the rest is a suffix.\n",
    "# NOTE(review): the suffix presumably identifies the sample/batch - confirm.\n",
    "BARCODE_LEN = 16"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'Unnamed: 0' is a leftover row-number column in the CSV. Dropping it in the\n",
    "# same statement as the read (no inplace=True) keeps this cell re-runnable.\n",
    "phenoDat = pd.read_csv(PHENO_CSV).drop(columns='Unnamed: 0')\n",
    "print(phenoDat.shape)\n",
    "phenoDat.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only .obs is used below, so open the file in backed mode instead of loading\n",
    "# the expression matrix into memory. An earlier full read_h5ad() in this\n",
    "# notebook failed with MemoryError while allocating ~396M float32 values.\n",
    "# NOTE(review): depending on the anndata version, layers may still be read\n",
    "# eagerly in backed mode - confirm this is enough on the target machine.\n",
    "adata = sc.read_h5ad(ADATA_H5AD, backed='r')\n",
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Derive barcodes from the obs names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One truncated barcode per AnnData cell, in obs order.\n",
    "indices = [obs_name[:BARCODE_LEN] for obs_name in adata.obs_names]\n",
    "indices[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How well do the barcodes match?\n",
    "\n",
    "If either side contains repeated barcodes, a join on barcode alone pairs every repeat with every repeat on the other side."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "barcode_index = pd.Index(indices)\n",
    "\n",
    "# Boolean mask over AnnData cells: True where the barcode occurs in phenoDat.\n",
    "# Vectorised replacement for testing each barcode against the full column.\n",
    "common = barcode_index.isin(phenoDat['barcodes'])\n",
    "\n",
    "pd.Series({\n",
    "    'cells in adata': len(barcode_index),\n",
    "    'distinct truncated barcodes in adata': barcode_index.nunique(),\n",
    "    'adata cells with a barcode in phenoDat': int(common.sum()),\n",
    "    'rows in phenoDat': len(phenoDat),\n",
    "    'distinct barcodes in phenoDat': phenoDat['barcodes'].nunique(),\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot check: first barcode on each side (.iloc[0] shows a single value\n",
    "# instead of the whole column).\n",
    "indices[0], phenoDat['barcodes'].iloc[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join (diagnostic only)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the join input from a derived frame so `adata.obs` keeps its original,\n",
    "# unique obs names (kept here in the 'obs_name' column) and the cell can be\n",
    "# re-run. Because `adata` is never modified, no backup copy of it is needed.\n",
    "obs = (\n",
    "    adata.obs\n",
    "    .assign(barcodes=indices)\n",
    "    .rename_axis('obs_name')\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# NOTE: barcodes repeat on both sides, so this inner merge is many-to-many\n",
    "# and returns more rows than either input has matching cells.\n",
    "df = obs.merge(phenoDat, on='barcodes', how='inner').set_index('barcodes')\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct barcodes present in the joined table.\n",
    "df.index.nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Next steps\n",
    "\n",
    "- **TODO:** add a per-sample key to the join. `adata.obs` has a `folder` column and a suffix on each obs name; `phenoDat` has `cell`, `treatment` and `replicate`. The mapping between them is not recorded in this notebook and has to be confirmed first.\n",
    "- Once the key is in place, pass `validate='one_to_one'` to `merge` so that any remaining duplication fails loudly."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "(Fertig) Python 3.7",
   "language": "python",
   "name": "fertig_python_3_7"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}