feat: add csv feature to extract_tables #79

Merged
merged 3 commits on Jan 14, 2025
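In short, this PR adds a `return_type` option to `extract_tables`, so callers can get the extracted tables back as CSV instead of HTML. A minimal usage sketch, assuming the usual `AnyParser(api_key=...)` constructor and reusing the sample file from the example notebook (the API key below is a placeholder):

```python
from any_parser import AnyParser

# Placeholder credentials; assumes the usual AnyParser(api_key=...) constructor.
ap = AnyParser(api_key="YOUR_CAMBIO_API_KEY")

# Default behaviour is unchanged: HTML tables plus the elapsed time.
html_output, time_elapsed = ap.extract_tables(
    file_path="./sample_data/test_1figure_1table.png"
)

# New in this PR: request CSV instead (requires pandas to be installed).
csv_output, time_elapsed = ap.extract_tables(
    file_path="./sample_data/test_1figure_1table.png", return_type="csv"
)
print(csv_output)
```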
55 changes: 53 additions & 2 deletions any_parser/any_parser.py
@@ -4,6 +4,8 @@
import json
import time
import uuid
from collections.abc import Iterable
from io import StringIO
from pathlib import Path

import requests
@@ -184,26 +186,75 @@ def extract_pii(
            file_type=file_type,
        )

    @staticmethod
    def flatten_to_string(item):
        """
        Flatten any iterable object into a single string.
        """

        if isinstance(item, str):
            return item

        # if item is a dict, flatten all keys and values
        if isinstance(item, dict):
            parts = []
            for k, v in item.items():
                parts.append(AnyParser.flatten_to_string(k))
                parts.append(AnyParser.flatten_to_string(v))
            return "".join(parts)

        # if item is any other iterable (list, tuple, etc.), flatten each element
        if isinstance(item, Iterable):
            parts = []
            for sub_item in item:
                parts.append(AnyParser.flatten_to_string(sub_item))
            return "".join(parts)

        # not iterable: fall back to the string representation
        return str(item)

    @handle_file_processing
    def extract_tables(
        self,
        file_path=None,
        file_content=None,
        file_type=None,
        return_type="html",
    ):
        """Extract tables from a file in real-time.

        Args:
            file_path (str): The path to the file to be parsed.
            return_type (str): 'html' (default) or 'csv'.
        Returns:
-            tuple(str, str): The extracted data and the time taken.
+            tuple(str, str)
        """
-        return self._sync_extract_tables.extract(
+        extracted_html, time_elapsed = self._sync_extract_tables.extract(
            file_path=file_path,
            file_content=file_content,
            file_type=file_type,
        )

        if isinstance(extracted_html, list):
            extracted_html = AnyParser.flatten_to_string(extracted_html)

        if return_type.lower() == "csv":
            try:
                import pandas as pd
            except ImportError:
                raise ImportError("Please install pandas to use CSV return_type")

            if isinstance(extracted_html, list):
                extracted_html = "".join(str(item) for item in extracted_html)

            df_list = pd.read_html(StringIO(extracted_html))
            combined_df = pd.concat(df_list, ignore_index=True)
            csv_output = combined_df.to_csv(index=False)

            return csv_output, time_elapsed

        return extracted_html, time_elapsed

    @handle_file_processing
    def extract_key_value(
        self,
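For readers skimming the diff: the CSV path above flattens whatever the extraction endpoint returns into one HTML string, has pandas parse every `<table>` in it, concatenates the resulting DataFrames, and serialises them as CSV. A self-contained sketch of that conversion, with a hard-coded HTML fragment standing in for a real API response (`pd.read_html` also needs an HTML parser such as lxml installed):

```python
from io import StringIO

import pandas as pd

# Stand-in for the endpoint's output: a list of HTML fragments rather than one string.
extracted_html = [
    "<table><tr><th>participants</th><th>mean latency (ms)</th></tr>",
    "<tr><td>1</td><td>17.0</td></tr><tr><td>2</td><td>24.5</td></tr></table>",
]

# Same idea as AnyParser.flatten_to_string: collapse the nested pieces into one string.
html_string = "".join(extracted_html)

# pandas turns each <table> element into a DataFrame; concat merges them.
df_list = pd.read_html(StringIO(html_string))
combined_df = pd.concat(df_list, ignore_index=True)

print(combined_df.to_csv(index=False))
# participants,mean latency (ms)
# 1,17.0
# 2,24.5
```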
69 changes: 54 additions & 15 deletions examples/extract_tables.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -15,15 +15,23 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ubuntu/any-parser/any_parser/__init__.py\n"
]
}
],
"source": [
"from IPython.display import display, Markdown\n",
"from any_parser import AnyParser"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -36,8 +44,13 @@
"metadata": {},
"outputs": [],
"source": [
"file_path = \"./sample_data/test_1figure_1table.png\"\n",
"html_output, time = ap.extract_tables(file_path)"
"csv_output, time_info = ap.extract_tables(\n",
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n",
")\n",
"\n",
"html_output, time_info = ap.extract_tables(\n",
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n",
")"
]
},
{
@@ -46,14 +59,12 @@
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Time Elapsed: 3.97 seconds'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2 μs, sys: 0 ns, total: 2 μs\n",
"Wall time: 5.25 μs\n"
]
}
],
"source": [
@@ -62,9 +73,31 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"0,1,2\n",
",latency,(ms)\n",
"participants,mean,99th percentile\n",
"1,17.0 +1.4,75.0 34.9\n",
"2,24.5 +2.5,87.6 35.9\n",
"5,31.5 +6.2,104.5 52.2\n",
"10,30.0 +3.7,95.6 25.4\n",
"25,35.5 +5.6,100.4 42.7\n",
"50,42.7 +4.1,93.7 22.9\n",
"100,71.4 +7.6,131.2 +17.6\n",
"200,150.5 +11.0,320.3 35.1\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
@@ -93,6 +126,12 @@
}
],
"source": [
"if isinstance(csv_output, list):\n",
" csv_output_str = \"\\n\".join(csv_output)\n",
"else:\n",
" csv_output_str = csv_output\n",
"\n",
"display(Markdown(csv_output_str))\n",
"display(Markdown(html_output))"
]
}
@@ -113,7 +152,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "-1.-1.-1"
"version": "3.12.2"
}
},
"nbformat": 4,
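One follow-up note on the notebook output above: `csv_output` is a plain comma-separated string, so piping it through `Markdown` just renders raw text. If a tabular view is preferred, one option (not part of this PR) is to read the string back into a DataFrame; a small sketch with a stand-in string so it runs on its own:

```python
from io import StringIO

import pandas as pd

# In the notebook this string comes from ap.extract_tables(..., return_type="csv");
# a short stand-in is used here so the snippet is self-contained.
csv_output = "participants,mean,99th percentile\n1,17.0,75.0\n2,24.5,87.6\n"

df = pd.read_csv(StringIO(csv_output))
print(df)
```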