[#17] Cumulative value

akvo · Jan 21, 2025 · 33c208b · 33c208b
1 parent c0d5ed1
commit 33c208b
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 38 deletions.
diff --git a/src/main.ipynb b/src/main.ipynb
@@ -290,7 +290,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "final_columns = ['indicator','year','country','unit','value_name','jmp_category','commitment','value','base_value','initial_value','2030','2050']"
+    "final_columns = ['indicator','year','country','unit','value_name','jmp_category','commitment','value','base_value','initial_value','cumulative_value','2030','2050']"
    ]
   },
   {
@@ -374,7 +374,7 @@
     "            return \"BS\"\n",
     "        if \"Safely\" in x[\"2nd_dimension\"]:\n",
     "            return \"SM\"\n",
-    "        return np.nan\n",
+    "        return \"Base\"\n",
     "    return x[\"jmp_category\"]"
    ]
   },
@@ -666,7 +666,7 @@
     "            \"value\": value_list[\"value\"]\n",
     "        })\n",
     "    df = pd.DataFrame(new_data)\n",
-    "    df = filter_dataframe_by_year(df, file)\n",
+    "    \n",
     "    df_split = pd.DataFrame(df['value_type'].tolist(), index=df.index)\n",
     "    df_split.columns = ['value_name', 'jmp_category', 'commitment']\n",
     "    df_final = pd.concat([df, df_split], axis=1)\n",
@@ -676,6 +676,18 @@
     "    df_final['jmp_category'] = df_final['jmp_category'].replace({\"BS\": \"ALB\"})\n",
     "    \n",
     "    df_final['commitment'] = df_final.apply(modify_commitment_name, axis=1)\n",
+    "\n",
+    "    # Make sure that all value is numeric\n",
+    "    df_final['value'] = pd.to_numeric(df_final['value'], errors='coerce')\n",
+    "    df_final['value'] = df_final['value'].fillna(0)\n",
+    "    \n",
+    "    # Add cumulative column grouped by multiple columns\n",
+    "    df_final.to_csv(\"../tests/original_data.csv\", index=False)\n",
+    "    group_columns = ['country', 'jmp_category', 'commitment', 'indicator', 'value_name']\n",
+    "    df_final['cumulative_value'] = df_final.groupby(group_columns)['value'].cumsum()\n",
+    "    df_final['jmp_category'] = df_final['jmp_category'].replace('Base', np.nan)\n",
+    "    \n",
+    "    df_final = filter_dataframe_by_year(df_final, file)\n",
     "    # df_final.to_csv(\"testing-1.csv\",index=False)\n",
     "    # Add Value for ALB\n",
     "    if \"Water Service\" in file or \"Sanitation Service\" in file:\n",
@@ -805,6 +817,7 @@
        "      <th>commitment</th>\n",
        "      <th>value</th>\n",
        "      <th>base_value</th>\n",
+       "      <th>cumulative</th>\n",
        "      <th>initial_value</th>\n",
        "      <th>2030</th>\n",
        "      <th>2050</th>\n",
@@ -823,6 +836,7 @@
        "      <td>Base</td>\n",
        "      <td>1.237</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>15.432</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -839,6 +853,7 @@
        "      <td>Base</td>\n",
        "      <td>1.143</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>39.085</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -855,6 +870,7 @@
        "      <td>Full Sanitation Access in 2030</td>\n",
        "      <td>1.075</td>\n",
        "      <td>1.237</td>\n",
+       "      <td>14.357</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -871,6 +887,7 @@
        "      <td>Full Sanitation Access in 2030</td>\n",
        "      <td>1.064</td>\n",
        "      <td>1.143</td>\n",
+       "      <td>35.647</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -887,6 +904,7 @@
        "      <td>Full Sanitation Access in 2050</td>\n",
        "      <td>1.191</td>\n",
        "      <td>1.237</td>\n",
+       "      <td>15.116</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -911,12 +929,12 @@
        "3  Mil People         FS          ALB  Full Sanitation Access in 2030  1.064   \n",
        "4  Mil People         FS          ALB  Full Sanitation Access in 2050  1.191   \n",
        "\n",
-       "  base_value  initial_value  2030  2050  remove  \n",
-       "0        NaN            NaN   NaN   NaN   False  \n",
-       "1        NaN            NaN   NaN   NaN   False  \n",
-       "2      1.237            NaN   NaN   NaN   False  \n",
-       "3      1.143            NaN   NaN   NaN    True  \n",
-       "4      1.237            NaN   NaN   NaN    True  "
+       "   base_value  cumulative  initial_value  2030  2050  remove  \n",
+       "0         NaN      15.432            NaN   NaN   NaN   False  \n",
+       "1         NaN      39.085            NaN   NaN   NaN   False  \n",
+       "2       1.237      14.357            NaN   NaN   NaN   False  \n",
+       "3       1.143      35.647            NaN   NaN   NaN    True  \n",
+       "4       1.237      15.116            NaN   NaN   NaN    True  "
       ]
      },
      "execution_count": 32,
@@ -1799,6 +1817,7 @@
        "      <th>commitment</th>\n",
        "      <th>value</th>\n",
        "      <th>base_value</th>\n",
+       "      <th>cumulative</th>\n",
        "      <th>initial_value</th>\n",
        "      <th>2030</th>\n",
        "      <th>2050</th>\n",
@@ -1818,6 +1837,7 @@
        "      <td>6x</td>\n",
        "      <td>0.208</td>\n",
        "      <td>0.302</td>\n",
+       "      <td>2.964</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1835,6 +1855,7 @@
        "      <td>6x</td>\n",
        "      <td>0.143</td>\n",
        "      <td>0.178</td>\n",
+       "      <td>6.369</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1850,13 +1871,13 @@
        "36384  Malnourished Children, Headcount - Millions  2030  Zambia  Mil People   \n",
        "36385  Malnourished Children, Headcount - Millions  2050  Zambia  Mil People   \n",
        "\n",
-       "      value_name jmp_category commitment  value base_value  initial_value  \\\n",
-       "36384        WSI           SM         6x  0.208      0.302            NaN   \n",
-       "36385        WSI           SM         6x  0.143      0.178            NaN   \n",
+       "      value_name jmp_category commitment  value  base_value  cumulative  \\\n",
+       "36384        WSI           SM         6x  0.208       0.302       2.964   \n",
+       "36385        WSI           SM         6x  0.143       0.178       6.369   \n",
        "\n",
-       "       2030  2050  remove  jmp_name_id  \n",
-       "36384   NaN   NaN   False            3  \n",
-       "36385   NaN   NaN   False            3  "
+       "       initial_value  2030  2050  remove  jmp_name_id  \n",
+       "36384            NaN   NaN   NaN   False            3  \n",
+       "36385            NaN   NaN   NaN   False            3  "
       ]
      },
      "execution_count": 43,
@@ -1930,6 +1951,7 @@
        "      <th>year</th>\n",
        "      <th>value</th>\n",
        "      <th>base_value</th>\n",
+       "      <th>cumulative</th>\n",
        "      <th>initial_value</th>\n",
        "      <th>2030</th>\n",
        "      <th>2050</th>\n",
@@ -1949,6 +1971,7 @@
        "      <td>2050</td>\n",
        "      <td>7.761</td>\n",
        "      <td>8.036</td>\n",
+       "      <td>537.412</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1964,8 +1987,9 @@
        "    <tr>\n",
        "      <th>36382</th>\n",
        "      <td>2050</td>\n",
-       "      <td>16.6</td>\n",
+       "      <td>16.600</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>348.138</td>\n",
        "      <td>5.53</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1983,6 +2007,7 @@
        "      <td>2050</td>\n",
        "      <td>9.581</td>\n",
        "      <td>4.591</td>\n",
+       "      <td>52.341</td>\n",
        "      <td>0.90</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -2000,6 +2025,7 @@
        "      <td>2050</td>\n",
        "      <td>7.209</td>\n",
        "      <td>8.036</td>\n",
+       "      <td>527.720</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -2017,6 +2043,7 @@
        "      <td>2050</td>\n",
        "      <td>0.143</td>\n",
        "      <td>0.178</td>\n",
+       "      <td>6.369</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -2034,26 +2061,26 @@
        "</div>"
       ],
       "text/plain": [
-       "       year  value base_value  initial_value  2030  2050  remove  jmp_name_id  \\\n",
-       "36381  2050  7.761      8.036            NaN   NaN   NaN   False            3   \n",
-       "36382  2050   16.6        NaN           5.53   NaN   NaN   False            1   \n",
-       "36383  2050  9.581      4.591           0.90   NaN   NaN   False            2   \n",
-       "36384  2050  7.209      8.036            NaN   NaN   NaN   False            1   \n",
-       "36385  2050  0.143      0.178            NaN   NaN   NaN   False            3   \n",
+       "       year   value  base_value  cumulative  initial_value  2030  2050  \\\n",
+       "36381  2050   7.761       8.036     537.412            NaN   NaN   NaN   \n",
+       "36382  2050  16.600         NaN     348.138           5.53   NaN   NaN   \n",
+       "36383  2050   9.581       4.591      52.341           0.90   NaN   NaN   \n",
+       "36384  2050   7.209       8.036     527.720            NaN   NaN   NaN   \n",
+       "36385  2050   0.143       0.178       6.369            NaN   NaN   NaN   \n",
        "\n",
-       "       indicator_id  unit_id  value_name_id  jmp_category_id  commitment_id  \\\n",
-       "36381             5        4              7                1              1   \n",
-       "36382            12        5              1                2              5   \n",
-       "36383             6        5              2                1              7   \n",
-       "36384             5        4              6                2              4   \n",
-       "36385             4        4              7                2              4   \n",
+       "       remove  jmp_name_id  indicator_id  unit_id  value_name_id  \\\n",
+       "36381   False            3             5        4              7   \n",
+       "36382   False            1            12        5              1   \n",
+       "36383   False            2             6        5              2   \n",
+       "36384   False            1             5        4              6   \n",
+       "36385   False            3             4        4              7   \n",
        "\n",
-       "       country_id  \n",
-       "36381          22  \n",
-       "36382          15  \n",
-       "36383          10  \n",
-       "36384          22  \n",
-       "36385          23  "
+       "       jmp_category_id  commitment_id  country_id  \n",
+       "36381                1              1          22  \n",
+       "36382                2              5          15  \n",
+       "36383                1              7          10  \n",
+       "36384                2              4          22  \n",
+       "36385                2              4          23  "
       ]
      },
      "execution_count": 45,

diff --git a/src/main.py b/src/main.py
@@ -165,7 +165,7 @@ def map_country_name(country):
 # In[13]:
 
 
-final_columns = ['indicator','year','country','unit','value_name','jmp_category','commitment','value','base_value','initial_value','2030','2050']
+final_columns = ['indicator','year','country','unit','value_name','jmp_category','commitment','value','base_value','initial_value','cumulative_value','2030','2050']
 
 
 # In[14]:
@@ -283,7 +283,7 @@ def remove_unmatches_jmp_category(x):
         if x["2nd_dimension"] == "SafelyManaged" and x["jmp_category"] == "ALB":
             return True
         if x["2nd_dimension"] == "SafelyManaged" and x["jmp_category"] == "BS":
-            return True
+            return "Base"
     return False
 
 
@@ -441,16 +441,28 @@ def populate_data():
             "value": value_list["value"]
         })
     df = pd.DataFrame(new_data)
-    df = filter_dataframe_by_year(df, file)
+
     df_split = pd.DataFrame(df['value_type'].tolist(), index=df.index)
     df_split.columns = ['value_name', 'jmp_category', 'commitment']
     df_final = pd.concat([df, df_split], axis=1)
 
     df_final['indicator'] = get_ifs_name(file)
     df_final['jmp_category'] = df_final.apply(base_jmp_category, axis=1)
     df_final['jmp_category'] = df_final['jmp_category'].replace({"BS": "ALB"})
-    
+
     df_final['commitment'] = df_final.apply(modify_commitment_name, axis=1)
+
+    # Make sure that all value is numeric
+    df_final['value'] = pd.to_numeric(df_final['value'], errors='coerce')
+    df_final['value'] = df_final['value'].fillna(0)
+
+    # Add cumulative column grouped by multiple columns
+    df_final.to_csv("../tests/original_data.csv", index=False)
+    group_columns = ['country', 'jmp_category', 'commitment', 'indicator', 'value_name']
+    df_final['cumulative_value'] = df_final.groupby(group_columns)['value'].cumsum()
+    df_final['jmp_category'] = df_final['jmp_category'].replace('Base', np.nan)
+
+    df_final = filter_dataframe_by_year(df_final, file)
     # df_final.to_csv("testing-1.csv",index=False)
     # Add Value for ALB
     if "Water Service" in file or "Sanitation Service" in file: