{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader\n",
    "from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from pathlib import Path\n",
    "import random\n",
    "\n",
    "## Input data directory\n",
    "data_dir = \"cureus\"\n",
    "inputdirectory = Path(f\"./data_input/{data_dir}\")\n",
    "## This is where the output csv files will be written\n",
    "out_dir = data_dir\n",
    "outputdirectory = Path(f\"./data_output/{out_dir}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1/1 [00:01<00:00,  1.82s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of chunks =  23\n",
      "An extensive literature search was performed, and 56 articles published in peer-reviewed journals between 2005 and 2021 were selected and analyzed. The corresponding authors' experiential knowledge served as the foundation for the analysis.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "## Load every file found under `inputdirectory`.\n",
    "## Alternative loaders, kept for reference:\n",
    "##   Dir PDF Loader: loader = PyPDFDirectoryLoader(inputdirectory)\n",
    "##   File Loader:    loader = PyPDFLoader(\"./data/MedicalDocuments/orf-path_health-n1.pdf\")\n",
    "loader = DirectoryLoader(inputdirectory, show_progress=True)\n",
    "documents = loader.load()\n",
    "\n",
    "## Split the documents into overlapping chunks.\n",
    "## Sizes are measured with len(), i.e. in characters.\n",
    "splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=1500,\n",
    "    chunk_overlap=150,\n",
    "    length_function=len,\n",
    "    is_separator_regex=False,\n",
    ")\n",
    "\n",
    "pages = splitter.split_documents(documents)\n",
    "print(\"Number of chunks = \", len(pages))\n",
    "\n",
    "## Print one sample chunk, but only when it exists: an empty or very small\n",
    "## input directory yields fewer chunks and pages[3] would raise IndexError.\n",
    "sample_index = 3\n",
    "if len(pages) > sample_index:\n",
    "    print(pages[sample_index].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a dataframe of all the chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(23, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>source</th>\n",
       "      <th>chunk_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Abstract India’s health indicators have improv...</td>\n",
       "      <td>data_input/cureus/cureus-0015-00000040274.txt</td>\n",
       "      <td>0f56d8fbefa04f1e877f573938f78ff1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Categories: Public Health, Epidemiology/Public...</td>\n",
       "      <td>data_input/cureus/cureus-0015-00000040274.txt</td>\n",
       "      <td>92789b719a254c8385327b9d243935b6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Introduction And Background India’s health ind...</td>\n",
       "      <td>data_input/cureus/cureus-0015-00000040274.txt</td>\n",
       "      <td>9eefb3bf352a459c8895f272b632724e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>An extensive literature search was performed, ...</td>\n",
       "      <td>data_input/cureus/cureus-0015-00000040274.txt</td>\n",
       "      <td>7c21bdb708d14855b7b3de9d8564b175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Review Overview of the public and private heal...</td>\n",
       "      <td>data_input/cureus/cureus-0015-00000040274.txt</td>\n",
       "      <td>bfc37e1213e7428d963fdac63eb80079</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  \\\n",
       "0  Abstract India’s health indicators have improv...   \n",
       "1  Categories: Public Health, Epidemiology/Public...   \n",
       "2  Introduction And Background India’s health ind...   \n",
       "3  An extensive literature search was performed, ...   \n",
       "4  Review Overview of the public and private heal...   \n",
       "\n",
       "                                          source  \\\n",
       "0  data_input/cureus/cureus-0015-00000040274.txt   \n",
       "1  data_input/cureus/cureus-0015-00000040274.txt   \n",
       "2  data_input/cureus/cureus-0015-00000040274.txt   \n",
       "3  data_input/cureus/cureus-0015-00000040274.txt   \n",
       "4  data_input/cureus/cureus-0015-00000040274.txt   \n",
       "\n",
       "                           chunk_id  \n",
       "0  0f56d8fbefa04f1e877f573938f78ff1  \n",
       "1  92789b719a254c8385327b9d243935b6  \n",
       "2  9eefb3bf352a459c8895f272b632724e  \n",
       "3  7c21bdb708d14855b7b3de9d8564b175  \n",
       "4  bfc37e1213e7428d963fdac63eb80079  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from helpers.df_helpers import documents2Dataframe\n",
    "## Turn the list of chunks into a dataframe using the project helper.\n",
    "## The recorded output shows the columns: text, source, chunk_id.\n",
    "df = documents2Dataframe(pages)\n",
    "## Sanity check: print the dataframe dimensions, then preview the first rows\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract Concepts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## This function uses the helpers/prompt function to extract concepts from text\n",
    "from helpers.df_helpers import df2Graph\n",
    "from helpers.df_helpers import graph2Df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If `regenerate` is set to True, the dataframes are regenerated and both are written to CSV files so that we don't have to compute them again.\n",
    "\n",
    "        dfne = dataframe of edges\n",
    "\n",
    "        df = dataframe of chunks\n",
    "\n",
    "\n",
    "Else the dataframes are read from the output directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(149, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_1</th>\n",
       "      <th>node_2</th>\n",
       "      <th>edge</th>\n",
       "      <th>chunk_id</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>india's health indicators</td>\n",
       "      <td>peer nations</td>\n",
       "      <td>continue to lag behind</td>\n",
       "      <td>ae0fd26675d645e787964255667e90f4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>health workers density</td>\n",
       "      <td>doctors and nurses/midwives</td>\n",
       "      <td>for 10,00 persons</td>\n",
       "      <td>ae0fd26675d645e787964255667e90f4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>skilled health workforce</td>\n",
       "      <td>india</td>\n",
       "      <td>reinforces the central role human resources ha...</td>\n",
       "      <td>ae0fd26675d645e787964255667e90f4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>skewed inter-state</td>\n",
       "      <td>urban-rural</td>\n",
       "      <td>and public-private sector divide</td>\n",
       "      <td>ae0fd26675d645e787964255667e90f4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>health budget</td>\n",
       "      <td>federal</td>\n",
       "      <td>offers an unprecedented opportunity to do this</td>\n",
       "      <td>ae0fd26675d645e787964255667e90f4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      node_1                       node_2  \\\n",
       "0  india's health indicators                 peer nations   \n",
       "2     health workers density  doctors and nurses/midwives   \n",
       "4   skilled health workforce                        india   \n",
       "5         skewed inter-state                  urban-rural   \n",
       "7              health budget                      federal   \n",
       "\n",
       "                                                edge  \\\n",
       "0                             continue to lag behind   \n",
       "2                                  for 10,00 persons   \n",
       "4  reinforces the central role human resources ha...   \n",
       "5                   and public-private sector divide   \n",
       "7     offers an unprecedented opportunity to do this   \n",
       "\n",
       "                           chunk_id  count  \n",
       "0  ae0fd26675d645e787964255667e90f4      4  \n",
       "2  ae0fd26675d645e787964255667e90f4      4  \n",
       "4  ae0fd26675d645e787964255667e90f4      4  \n",
       "5  ae0fd26675d645e787964255667e90f4      4  \n",
       "7  ae0fd26675d645e787964255667e90f4      4  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## To regenerate the graph with LLM, set this to True\n",
    "regenerate = False\n",
    "\n",
    "graph_csv = outputdirectory / \"graph.csv\"\n",
    "chunks_csv = outputdirectory / \"chunks.csv\"\n",
    "\n",
    "if regenerate:\n",
    "    concepts_list = df2Graph(df, model='zephyr:latest')\n",
    "    dfg1 = graph2Df(concepts_list)\n",
    "    ## Creates missing parent directories and does nothing if the directory already exists\n",
    "    outputdirectory.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    dfg1.to_csv(graph_csv, sep=\"|\", index=False)\n",
    "    df.to_csv(chunks_csv, sep=\"|\", index=False)\n",
    "else:\n",
    "    dfg1 = pd.read_csv(graph_csv, sep=\"|\")\n",
    "    ## Also reload the chunks that were saved together with the graph, so that the\n",
    "    ## chunk_id values in `df` match the ones stored in `dfg1`. In the recorded outputs\n",
    "    ## the ids of the freshly built `df` differ from those in graph.csv\n",
    "    ## (presumably documents2Dataframe assigns new ids on every run - TODO confirm).\n",
    "    if chunks_csv.exists():\n",
    "        df = pd.read_csv(chunks_csv, sep=\"|\")\n",
    "\n",
    "## Treat empty strings as missing values and drop relations that lack a node or an edge label\n",
    "dfg1 = dfg1.replace(\"\", np.nan).dropna(subset=[\"node_1\", \"node_2\", 'edge'])\n",
    "## Increasing the weight of the relation to 4.\n",
    "## We will assign the weight of 1 when later the contextual proximity will be calculated.\n",
    "dfg1['count'] = 4\n",
    "print(dfg1.shape)\n",
    "dfg1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating contextual proximity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_1</th>\n",
       "      <th>node_2</th>\n",
       "      <th>chunk_id</th>\n",
       "      <th>count</th>\n",
       "      <th>edge</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2827</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>nhm strategies</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>10</td>\n",
       "      <td>contextual proximity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2828</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>rural areas</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>2</td>\n",
       "      <td>contextual proximity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2829</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>social norms</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>2</td>\n",
       "      <td>contextual proximity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2830</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>urban areas</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>2</td>\n",
       "      <td>contextual proximity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2831</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>urban slums</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>2</td>\n",
       "      <td>contextual proximity</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             node_1          node_2  \\\n",
       "2827  world-class health facilities  nhm strategies   \n",
       "2828  world-class health facilities     rural areas   \n",
       "2829  world-class health facilities    social norms   \n",
       "2830  world-class health facilities     urban areas   \n",
       "2831  world-class health facilities     urban slums   \n",
       "\n",
       "                                               chunk_id  count  \\\n",
       "2827  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...     10   \n",
       "2828  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...      2   \n",
       "2829  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...      2   \n",
       "2830  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...      2   \n",
       "2831  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...      2   \n",
       "\n",
       "                      edge  \n",
       "2827  contextual proximity  \n",
       "2828  contextual proximity  \n",
       "2829  contextual proximity  \n",
       "2830  contextual proximity  \n",
       "2831  contextual proximity  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    ## Melt the dataframe into a list of nodes\n",
    "    dfg_long = pd.melt(\n",
    "        df, id_vars=[\"chunk_id\"], value_vars=[\"node_1\", \"node_2\"], value_name=\"node\"\n",
    "    )\n",
    "    dfg_long.drop(columns=[\"variable\"], inplace=True)\n",
    "    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.\n",
    "    dfg_wide = pd.merge(dfg_long, dfg_long, on=\"chunk_id\", suffixes=(\"_1\", \"_2\"))\n",
    "    # drop self loops\n",
    "    self_loops_drop = dfg_wide[dfg_wide[\"node_1\"] == dfg_wide[\"node_2\"]].index\n",
    "    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)\n",
    "    ## Group and count edges.\n",
    "    dfg2 = (\n",
    "        dfg2.groupby([\"node_1\", \"node_2\"])\n",
    "        .agg({\"chunk_id\": [\",\".join, \"count\"]})\n",
    "        .reset_index()\n",
    "    )\n",
    "    dfg2.columns = [\"node_1\", \"node_2\", \"chunk_id\", \"count\"]\n",
    "    dfg2.replace(\"\", np.nan, inplace=True)\n",
    "    dfg2.dropna(subset=[\"node_1\", \"node_2\"], inplace=True)\n",
    "    # Drop edges with 1 count\n",
    "    dfg2 = dfg2[dfg2[\"count\"] != 1]\n",
    "    dfg2[\"edge\"] = \"contextual proximity\"\n",
    "    return dfg2\n",
    "\n",
    "\n",
    "## Derive the co-occurrence edges from the relations in dfg1 and preview the last rows\n",
    "dfg2 = contextual_proximity(dfg1)\n",
    "dfg2.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merge both the dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_1</th>\n",
       "      <th>node_2</th>\n",
       "      <th>chunk_id</th>\n",
       "      <th>edge</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>56 articles</td>\n",
       "      <td>extensive literature search</td>\n",
       "      <td>d7a3e5085c7f4de4bc28fb0bd9cb0a94,d7a3e5085c7f4...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[54]</td>\n",
       "      <td>increasing violence against healthcare personnel</td>\n",
       "      <td>640835e2521045a395ab6465cc1ba4ca,640835e252104...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[55]</td>\n",
       "      <td>increasing violence against healthcare personnel</td>\n",
       "      <td>640835e2521045a395ab6465cc1ba4ca,640835e252104...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a bad situation</td>\n",
       "      <td>increasing violence against healthcare personnel</td>\n",
       "      <td>640835e2521045a395ab6465cc1ba4ca,640835e252104...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a worrisome new trend</td>\n",
       "      <td>increasing violence against healthcare personnel</td>\n",
       "      <td>640835e2521045a395ab6465cc1ba4ca,640835e252104...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>nhm strategies</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>754</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>rural areas</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>755</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>social norms</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>urban areas</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>757</th>\n",
       "      <td>world-class health facilities</td>\n",
       "      <td>urban slums</td>\n",
       "      <td>0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...</td>\n",
       "      <td>contextual proximity</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>758 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            node_1  \\\n",
       "0                      56 articles   \n",
       "1                             [54]   \n",
       "2                             [55]   \n",
       "3                  a bad situation   \n",
       "4            a worrisome new trend   \n",
       "..                             ...   \n",
       "753  world-class health facilities   \n",
       "754  world-class health facilities   \n",
       "755  world-class health facilities   \n",
       "756  world-class health facilities   \n",
       "757  world-class health facilities   \n",
       "\n",
       "                                               node_2  \\\n",
       "0                         extensive literature search   \n",
       "1    increasing violence against healthcare personnel   \n",
       "2    increasing violence against healthcare personnel   \n",
       "3    increasing violence against healthcare personnel   \n",
       "4    increasing violence against healthcare personnel   \n",
       "..                                                ...   \n",
       "753                                    nhm strategies   \n",
       "754                                       rural areas   \n",
       "755                                      social norms   \n",
       "756                                       urban areas   \n",
       "757                                       urban slums   \n",
       "\n",
       "                                              chunk_id                  edge  \\\n",
       "0    d7a3e5085c7f4de4bc28fb0bd9cb0a94,d7a3e5085c7f4...  contextual proximity   \n",
       "1    640835e2521045a395ab6465cc1ba4ca,640835e252104...  contextual proximity   \n",
       "2    640835e2521045a395ab6465cc1ba4ca,640835e252104...  contextual proximity   \n",
       "3    640835e2521045a395ab6465cc1ba4ca,640835e252104...  contextual proximity   \n",
       "4    640835e2521045a395ab6465cc1ba4ca,640835e252104...  contextual proximity   \n",
       "..                                                 ...                   ...   \n",
       "753  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...  contextual proximity   \n",
       "754  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...  contextual proximity   \n",
       "755  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...  contextual proximity   \n",
       "756  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...  contextual proximity   \n",
       "757  0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...  contextual proximity   \n",
       "\n",
       "     count  \n",
       "0        2  \n",
       "1        2  \n",
       "2        2  \n",
       "3        2  \n",
       "4        2  \n",
       "..     ...  \n",
       "753     10  \n",
       "754      2  \n",
       "755      2  \n",
       "756      2  \n",
       "757      2  \n",
       "\n",
       "[758 rows x 5 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Stack the relations from dfg1 on top of the contextual-proximity edges from dfg2\n",
    "edges_stacked = pd.concat([dfg1, dfg2], axis=0)\n",
    "\n",
    "## Collapse the rows that share the same (node_1, node_2) pair:\n",
    "## chunk ids and edge labels are comma-joined, counts are added up.\n",
    "dfg = (\n",
    "    edges_stacked.groupby([\"node_1\", \"node_2\"])\n",
    "    .agg(\n",
    "        chunk_id=(\"chunk_id\", \",\".join),\n",
    "        edge=(\"edge\", \",\".join),\n",
    "        count=(\"count\", \"sum\"),\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "dfg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the NetworkX Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(215,)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Every distinct node name that appears on either end of an edge\n",
    "endpoint_columns = [dfg['node_1'], dfg['node_2']]\n",
    "nodes = pd.concat(endpoint_columns, axis=0).unique()\n",
    "nodes.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "G = nx.Graph()\n",
    "\n",
    "## Add nodes to the graph\n",
    "G.add_nodes_from(str(node) for node in nodes)\n",
    "\n",
    "## Add edges to the graph.\n",
    "## dfg can contain both an (a, b) and a (b, a) row for the same pair, while nx.Graph is\n",
    "## undirected: a second add_edge() for the pair overwrites the attributes of the first.\n",
    "## To avoid losing a relation's title and weight to whichever row happens to come last,\n",
    "## keep the attributes of the heavier row.\n",
    "for _, row in dfg.iterrows():\n",
    "    source, target = str(row[\"node_1\"]), str(row[\"node_2\"])\n",
    "    ## Relations in dfg1 were given count 4, so dividing by 4 gives such a relation weight 1\n",
    "    weight = row['count'] / 4\n",
    "    if G.has_edge(source, target) and G[source][target][\"weight\"] >= weight:\n",
    "        continue\n",
    "    G.add_edge(source, target, title=row[\"edge\"], weight=weight)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate communities for coloring the nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Communities =  17\n",
      "[['56 articles', 'analysis', \"corresponding authors' experiential knowledge\", 'extensive literature search', 'peer-reviewed journals'], ['[54]', '[55]', 'a bad situation', 'a worrisome new trend', 'adequately compensated', 'can reverse the situation', 'defensive medicine practices', 'increasing violence against healthcare personnel', 'intense focus on specialization', 'low physician-to-patient ratio', 'overwhelmed physicians', 'primary care physicians', 'private marketplace', 'protect themselves by ordering unnecessary tests and procedures', 'results in delays in attending patients', 'set in', 'tempted to take on more patients than they can reasonably serve', 'thoughtful approach to government planning', 'underpaid physicians', 'unethical practices by pharmaceutical companies', 'will not be able to solve this'], ['accredit health facilities', 'enforcement of existing rules', 'health insurance scheme for central government employees', 'health system standardization', 'implementation of land allocation conditions', 'medical tourism hub', 'new rules for reasonable costs and cap profit margins', 'private health sector regulation', 'private health sector utilization'], ['accurate data about the quantity and geospatial location of its manpower', \"central government's leadership in the fight for better health for all indians\", 'live register for health personnel and infrastructure'], ['adoption', 'affordable, accessible, quality care', 'clinical and social skills', 'corporate houses', 'deeper structural problems', 'doctor-to-population ratio', 'erratic posting of personnel', 'evaluations', 'for-profit private health sector', 'government', 'government-funded health sector', 'health infrastructure', 'health personnel', 'health system', 'implementation', 'individualized mentoring', 'medical tourism', 'narayana health', 'national health mission (nhm)', 'non-hierarchical work environment', 'online training', 'out-of-pocket (oop) expenditure', 'out-of-pocket 
expenditure', 'overarching', 'patients', 'personnel retention', 'physicians', 'policy', 'primary care', 'private health sector', 'providing ongoing training', 'public and private health sectors', 'public health sector', 'public hospital beds', 'quality care', 'quality of care', 'quantum of services provided', 'recommendations made here', 'regulation', 'retaining', 'service provision', 'services they have been trained for', 'sourcing health personnel', 'strengthening', 'tier ii and iii cities', 'training', 'training initiative', 'transparency', 'transparency in creating training schedules', 'uniform standards', 'unmet demand for healthcare services', 'upgrading skills of nurses', 'vulnerable populations', 'well-equipped personnel', 'work environment', \"world health organization's recommended\"], ['ai-embedded logarithms now diagnose covid-19 disease from chest x-rays and ct scans', 'covid-19 pandemic catalyzing process and enabling delivery of healthcare', 'doctors can now perform an ophthalmic fundal examination online', 'drones being used to deliver medicines to communities', 'expanding application of technology to other portfolios within the health sector', 'innovations in digital technology supporting delivery of vital healthcare in high-income countries', 'manufacturing protective equipment locally using 3d printing', 'remote orthopedic examinations being used successfully', 'telemedicine for consultations with healthcare providers', 'uptake of telemedicine during covid-19 pandemic'], ['allocated budget to healthcare', 'ashas', 'ayush doctors', 'ayushman bharat', 'cadres', 'communities', 'communitization', 'contractors', 'contracts', 'contractual employees', 'contractual personnel', 'delayed', 'demotivation', 'doctors', 'economic norms', 'epidemiology/public health', 'financially protected from catastrophic health expenses', 'flexible financing', 'gdp', 'government of india', 'health policy', 'health sector reform', 'healthcare facilities', 'high-income 
countries', 'improved management through capacity building', 'india', 'indian government', 'indian-born physicians', 'informal providers', 'infrastructure', 'innovations in human resource management', 'just over $30 billion', 'largest émigré physician workforce in the world', 'lower remuneration', 'monitoring progress against standards', 'nhm', 'nhm strategies', 'nurses', 'oop expenditures by consumers', 'other middle-income countries and its neighbors', 'poor', 'poor infrastructure', 'public health', 'public sector', 'publicly financed health insurance scheme for the poor', 'publicly financed purchasing of services from private providers', 'rural areas', 'salary payments', 'skilled health workforce', 'social norms', 'temporary nature', 'total expenditure on health in india', 'urban areas', 'urban slums', 'viability concerns', 'working conditions', 'world-class health facilities'], ['anm', 'asha', 'female community health workers', 'government employee', 'private contractor'], ['centralization', 'interplay between private corporate sector, pharmaceutical industries, medical education, and healthcare services', 'medical commission', 'medical education', 'national medical council (nmc)', 'nmc act', 'state', 'universal health coverage'], [\"constraints imposed by corporate healthcare sector on doctors' professional autonomy\", 'doctors gravitating towards private sector employment due to low government salaries', 'high fees of private sector medical colleges', 'higher number of medical colleges per population in india compared to other countries', 'highest employer of doctors in india', 'highly paid doctors graduating from private sector medical colleges', 'low fees of government medical colleges', 'majority of young and early career doctors facing erosion of status and opportunities in the private healthcare sector', \"performance targets and practice constraints on doctors' professional autonomy in the corporate healthcare sector\", 'private sector healthcare 
employment', 'private sector investment in medical colleges', 'profitable medical colleges', 'star doctors with flourishing practices in the private healthcare sector'], ['digital technology', 'evin', 'medical devices', 'online training management information systems', 'wearable, trackable technology'], ['doctors and nurses/midwives', 'health workers density'], ['federal', 'health budget'], [\"india's health indicators\", 'methodology used', 'national medical council', 'peer nations', 'private sector', 'public-private sector divide', 'read approach', 'recent increase in the federal health budget', 'skewed inter-state', 'skilled personnel', 'urban-rural', 'who recommended thresholds'], ['initial evaluation', 'primary health centers', 'rural medical assistants (rmas)'], ['limited uptake', 'national health protection mission'], ['private health sector systems', 'public']]\n"
     ]
    }
   ],
   "source": [
    "## Run Girvan-Newman on G and use the second partition it yields as the communities.\n",
    "communities_generator = nx.community.girvan_newman(G)\n",
    "top_level_communities = next(communities_generator)\n",
    "next_level_communities = next(communities_generator)\n",
    "## Sort the nodes inside each community, then sort the communities themselves\n",
    "communities = sorted([sorted(members) for members in next_level_communities])\n",
    "print(\"Number of Communities = \", len(communities))\n",
    "print(communities)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a dataframe for community colors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node</th>\n",
       "      <th>color</th>\n",
       "      <th>group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>56 articles</td>\n",
       "      <td>#db57db</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>analysis</td>\n",
       "      <td>#db57db</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>corresponding authors' experiential knowledge</td>\n",
       "      <td>#db57db</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>extensive literature search</td>\n",
       "      <td>#db57db</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>peer-reviewed journals</td>\n",
       "      <td>#db57db</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>210</th>\n",
       "      <td>rural medical assistants (rmas)</td>\n",
       "      <td>#57bcdb</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>211</th>\n",
       "      <td>limited uptake</td>\n",
       "      <td>#db57ac</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>212</th>\n",
       "      <td>national health protection mission</td>\n",
       "      <td>#db57ac</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>213</th>\n",
       "      <td>private health sector systems</td>\n",
       "      <td>#57dbcc</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214</th>\n",
       "      <td>public</td>\n",
       "      <td>#57dbcc</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>215 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              node    color  group\n",
       "0                                      56 articles  #db57db      1\n",
       "1                                         analysis  #db57db      1\n",
       "2    corresponding authors' experiential knowledge  #db57db      1\n",
       "3                      extensive literature search  #db57db      1\n",
       "4                           peer-reviewed journals  #db57db      1\n",
       "..                                             ...      ...    ...\n",
       "210                rural medical assistants (rmas)  #57bcdb     15\n",
       "211                                 limited uptake  #db57ac     16\n",
       "212             national health protection mission  #db57ac     16\n",
       "213                  private health sector systems  #57dbcc     17\n",
       "214                                         public  #57dbcc     17\n",
       "\n",
       "[215 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import seaborn as sns\n",
    "palette = \"hls\"\n",
    "\n",
    "## Now add these colors to communities and make another dataframe\n",
    "def colors2Community(communities) -> pd.DataFrame:\n",
    "    \"\"\"Give every community its own palette color and a 1-based group id.\n",
    "\n",
    "    Args:\n",
    "        communities: Sized collection of communities, each an iterable of nodes.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with one row per node and the columns `node`, `color`\n",
    "        (hex string) and `group`. The three columns exist even when no\n",
    "        rows are produced.\n",
    "    \"\"\"\n",
    "    ## One hex color per community, taken from the module-level `palette`\n",
    "    p = sns.color_palette(palette, len(communities)).as_hex()\n",
    "    ## Randomise which color goes to which community (uses the global `random` state)\n",
    "    random.shuffle(p)\n",
    "    ## reversed(p) hands out colors from the end of the shuffled list, one per community\n",
    "    rows = [\n",
    "        {\"node\": node, \"color\": color, \"group\": group}\n",
    "        for group, (community, color) in enumerate(zip(communities, reversed(p)), start=1)\n",
    "        for node in community\n",
    "    ]\n",
    "    ## Explicit columns keep the schema stable when `rows` is empty\n",
    "    return pd.DataFrame(rows, columns=[\"node\", \"color\", \"group\"])\n",
    "\n",
    "\n",
    "colors = colors2Community(communities)\n",
    "colors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add colors to the graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Copy each node's community group and color onto the graph;\n",
    "## the node size is set to the node's degree.\n",
    "for entry in colors.itertuples(index=False):\n",
    "    node_attrs = G.nodes[entry.node]\n",
    "    node_attrs['group'] = entry.group\n",
    "    node_attrs['color'] = entry.color\n",
    "    node_attrs['size'] = G.degree[entry.node]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./docs/index.html\n"
     ]
    }
   ],
   "source": [
    "from pyvis.network import Network\n",
    "\n",
    "## Despite its name, this is the path of the HTML file the graph is written to\n",
    "graph_output_directory = \"./docs/index.html\"\n",
    "## Create the target folder if it is missing so that writing the file cannot fail on it\n",
    "Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "net = Network(\n",
    "    notebook=False,\n",
    "    # bgcolor=\"#1a1a1a\",\n",
    "    cdn_resources=\"remote\",\n",
    "    height=\"900px\",\n",
    "    width=\"100%\",\n",
    "    select_menu=True,\n",
    "    # font_color=\"#cccccc\",\n",
    "    filter_menu=False,\n",
    ")\n",
    "\n",
    "## Import nodes, edges and their attributes from the networkx graph\n",
    "net.from_nx(G)\n",
    "## Layout: ForceAtlas2 is the active solver; the commented lines are alternative solvers\n",
    "# net.repulsion(node_distance=150, spring_length=400)\n",
    "net.force_atlas_2based(central_gravity=0.015, gravity=-31)\n",
    "# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)\n",
    "net.show_buttons(filter_=[\"physics\"])\n",
    "\n",
    "net.show(graph_output_directory, notebook=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "OpenAI@3111",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}