Skip to content

Commit

Permalink
Update example
Browse files Browse the repository at this point in the history
  • Loading branch information
ppinchuk committed Feb 12, 2025
1 parent dcc9697 commit 105fc02
Showing 1 changed file with 20 additions and 93 deletions.
113 changes: 20 additions & 93 deletions examples/web_scraping_pipeline/example_scrape_wiki.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,91 +52,23 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Failed to decode PDF content!\n",
"poppler error creating document\n",
"Traceback (most recent call last):\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 376, in read_pdf\n",
" pages = _load_pdf_possibly_multi_col(pdf_bytes)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 391, in _load_pdf_possibly_multi_col\n",
" pages = pdftotext.PDF(pdf_bytes, physical=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pdftotext.Error: poppler error creating document\n",
"Failed to decode PDF content!\n",
"poppler error creating document\n",
"Traceback (most recent call last):\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 376, in read_pdf\n",
" pages = _load_pdf_possibly_multi_col(pdf_bytes)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 391, in _load_pdf_possibly_multi_col\n",
" pages = pdftotext.PDF(pdf_bytes, physical=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pdftotext.Error: poppler error creating document\n",
"Failed to decode PDF content!\n",
"poppler error creating document\n",
"Traceback (most recent call last):\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 376, in read_pdf\n",
" pages = _load_pdf_possibly_multi_col(pdf_bytes)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 391, in _load_pdf_possibly_multi_col\n",
" pages = pdftotext.PDF(pdf_bytes, physical=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pdftotext.Error: poppler error creating document\n",
"Failed to decode PDF content!\n",
"poppler error creating document\n",
"Traceback (most recent call last):\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 376, in read_pdf\n",
" pages = _load_pdf_possibly_multi_col(pdf_bytes)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 391, in _load_pdf_possibly_multi_col\n",
" pages = pdftotext.PDF(pdf_bytes, physical=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pdftotext.Error: poppler error creating document\n",
"Failed to decode PDF content!\n",
"poppler error creating document\n",
"Traceback (most recent call last):\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 376, in read_pdf\n",
" pages = _load_pdf_possibly_multi_col(pdf_bytes)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 391, in _load_pdf_possibly_multi_col\n",
" pages = pdftotext.PDF(pdf_bytes, physical=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pdftotext.Error: poppler error creating document\n",
"Failed to decode PDF content!\n",
"poppler error creating document\n",
"Traceback (most recent call last):\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 376, in read_pdf\n",
" pages = _load_pdf_possibly_multi_col(pdf_bytes)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/rolson2/GitHub/rolson2/elm/elm/utilities/parse.py\", line 391, in _load_pdf_possibly_multi_col\n",
" pages = pdftotext.PDF(pdf_bytes, physical=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
"pdftotext.Error: poppler error creating document\n",
"Found incompatible number of HTML (5) and parsed (6) tables! No replacement performed.\n",
"Found incompatible number of HTML (3) and parsed (1) tables! No replacement performed.\n"
]
}
],
"outputs": [],
"source": [
"from elm.web.search import web_search_links_as_docs\n",
"\n",
"\n",
"docs = await web_search_links_as_docs(QUERIES)"
"docs = await web_search_links_as_docs(\n",
" QUERIES,\n",
" pdf_read_kwargs={\"verbose\": False},\n",
" ignore_url_parts={\"openei.org\"},\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Don't worry about the error messages (if any). The messages are emitted because ELM attempts to download every URL as a PDF. If that is not possible, a message is logged, but ELM falls back on reading HTML from the URL instead (which is probably what you want).\n",
"\n",
"We can check `docs` to see that we indeed got some Google search results:"
]
},
Expand All @@ -148,12 +80,10 @@
{
"data": {
"text/plain": [
"[<elm.web.document.HTMLDocument at 0x138482450>,\n",
" <elm.web.document.HTMLDocument at 0x1386a9d10>,\n",
" <elm.web.document.HTMLDocument at 0x138d73490>,\n",
" <elm.web.document.HTMLDocument at 0x138482910>,\n",
" <elm.web.document.HTMLDocument at 0x136255e50>,\n",
" <elm.web.document.HTMLDocument at 0x138a07350>]"
"[<elm.web.document.HTMLDocument at 0x7f8ce47f7b30>,\n",
" <elm.web.document.HTMLDocument at 0x7f8ce4406a20>,\n",
" <elm.web.document.HTMLDocument at 0x7f8ce41eb770>,\n",
" <elm.web.document.HTMLDocument at 0x7f8ce416a540>]"
]
},
"execution_count": 3,
Expand Down Expand Up @@ -182,12 +112,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': 'https://www.nrel.gov/about/'}\n",
"{'source': 'https://en.wikipedia.org/wiki/National_Renewable_Energy_Laboratory'}\n",
"{'source': 'https://www.nrel.gov/about/leadership.html'}\n",
"{'source': 'https://en.wikipedia.org/wiki/United_States_Department_of_Energy_National_Laboratories'}\n",
"{'source': 'https://openei.org/wiki/NREL'}\n",
"{'source': 'https://www.nrel.gov/about/director.html'}\n"
"{'source': 'https://en.wikipedia.org/?title=National_Renewable_Energy_Lab&redirect=no'}\n",
"{'source': 'https://www.energy.gov/person/dr-martin-keller#:~:text=Martin%20Keller-,Dr.,Alliance%20for%20Sustainable%20Energy%2C%20LLC.'}\n",
"{'source': 'https://www2.nrel.gov/about/leadership'}\n",
"{'source': 'https://www.linkedin.com/in/martin-keller-a09b016'}\n"
]
}
],
Expand All @@ -214,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -233,20 +161,19 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': 'https://en.wikipedia.org/wiki/National_Renewable_Energy_Laboratory'}\n",
"{'source': 'https://en.wikipedia.org/wiki/United_States_Department_of_Energy_National_Laboratories'}\n"
"{'source': 'https://en.wikipedia.org/wiki/National_Renewable_Energy_Laboratory'}\n"
]
}
],
"source": [
"from elm.web.search.google import filter_documents\n",
"from elm.web.utilities import filter_documents\n",
"\n",
"docs = await filter_documents(docs, url_is_wiki)\n",
"for d in docs:\n",
Expand All @@ -272,7 +199,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down

0 comments on commit 105fc02

Please # to comment.