clean up notebook

PacktPublishing · Dec 19, 2024 · ee779db · ee779db
1 parent 29354c2
commit ee779db
Show file tree

Hide file tree

Showing 3 changed files with 239 additions and 74 deletions.
diff --git a/ch8/named-entity-recognition/ner-from-web.py b/ch8/named-entity-recognition/ner-from-web.py
@@ -0,0 +1,140 @@
+# %%
+from haystack import Pipeline
+from haystack.components.preprocessors import DocumentCleaner
+from haystack.components.websearch import SerperDevWebSearch
+from haystack.components.fetchers import LinkContentFetcher
+from haystack.components.converters import HTMLToDocument
+from haystack.components.writers import DocumentWriter
+from haystack import Pipeline
+from haystack.components.extractors import NamedEntityExtractor
+from haystack import component, Document
+from typing import Any, Dict, List, Union
+
+from dotenv import load_dotenv
+import os
+
+load_dotenv(".env")  # Load variables from a local .env file before reading them below
+open_ai_key = os.getenv("OPENAI_API_KEY")  # NOTE(review): never referenced again in this script
+serper_api_key = os.getenv("SERPERDEV_API_KEY")  # NOTE(review): never passed to SerperDevWebSearch - presumably it reads the env var itself; confirm
+
+
+# %% [markdown]
+# ### Define custom component
+
+# %%
+@component
+class NERPopulator:
+    """Group the named entities found in each document by entity type.
+
+    Reads the annotations stored under ``meta['named_entities']``, keeps those
+    scoring at least ``min_score`` and collects the unique matched text for
+    the LOC, PER, ORG and MISC labels.
+
+    Args:
+        min_score (float): Lowest score an entity needs to be kept (default 0.8).
+    """
+
+    ENTITY_TYPES = ("LOC", "PER", "ORG", "MISC")
+
+    def __init__(self, min_score: float = 0.8):
+        self.min_score = min_score
+
+    @component.output_types(documents=List[Dict[str, Any]])
+    def run(self, sources: List[Document]) -> Dict[str, List[Dict[str, Any]]]:
+        """Build one summary record per document in ``sources``.
+
+        Args:
+            sources (list): List of Haystack Document objects.
+
+        Returns:
+            dict: ``{'documents': records}`` where each record is a dictionary
+            with the keys 'document_id', 'content' and 'meta'; 'meta' maps each
+            entity type to a comma-separated string and carries the source 'url'.
+        """
+        extracted_data = []
+
+        for document in sources:
+            content = document.content
+            text = content or ""  # Slicing below must not fail when content is None
+            named_entities = document.meta.get('named_entities', [])
+
+            # Sets to store unique entities by type
+            entities_by_type = {label: set() for label in self.ENTITY_TYPES}
+
+            # Keep only confident entities whose label is one we track
+            for entity in named_entities:
+                if float(entity.score) < self.min_score:
+                    continue
+                if entity.entity in entities_by_type:
+                    entities_by_type[entity.entity].add(text[entity.start:entity.end])
+
+            # Sort before joining: set order varies between runs, sorted output is reproducible
+            meta = {label: ",".join(sorted(words)) for label, words in entities_by_type.items()}
+            meta["url"] = document.meta.get('url', 'N/A')  # Default to 'N/A' if URL is not available
+
+            # Append the result for this document
+            extracted_data.append({
+                'document_id': document.id,
+                'content': content,
+                'meta': meta
+            })
+
+        return {"documents": extracted_data}
+
+
+# %% [markdown]
+# ### Build Haystack pipeline with custom component
+
+# %%
+
+# Create the pipeline and the components it will hold: search, fetch, convert, clean, extract, summarize
+pipeline = Pipeline()
+web_search = SerperDevWebSearch(top_k=5,
+                                allowed_domains=["https://www.britannica.com/"])  # NOTE(review): a full URL is passed as a domain - confirm a bare "britannica.com" is not expected
+link_content = LinkContentFetcher(retry_attempts=3,
+                                  timeout=10)
+html_to_doc = HTMLToDocument()
+document_cleaner = DocumentCleaner(
+                                remove_empty_lines=True,
+                                remove_extra_whitespaces=True,
+                                remove_repeated_substrings=False,
+                                remove_substrings=['\n-']  # presumably strips newline-plus-dash sequences from the text - confirm
+                            )
+extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
+extractor.warm_up()  # called here, before the extractor is added to the pipeline
+
+ner_component = NERPopulator()
+
+# Register each component under the name used by connect() and by the run() input/output dictionaries
+pipeline.add_component(name='search', instance=web_search)
+pipeline.add_component(name ='fetcher' , instance= link_content)
+pipeline.add_component(name='htmldocument', instance=html_to_doc)
+pipeline.add_component(name='cleaner', instance=document_cleaner)
+pipeline.add_component(name='extractor', instance=extractor)
+pipeline.add_component(name='ner', instance=ner_component)
+
+# Chain the components in order; only the first link names explicit sockets, the rest give component names only
+pipeline.connect("search.links", "fetcher.urls")
+pipeline.connect("fetcher", "htmldocument")
+pipeline.connect("htmldocument", "cleaner")
+pipeline.connect("cleaner", "extractor")
+pipeline.connect("extractor", "ner")  # presumably resolved to NERPopulator.run's only input, `sources` - confirm
+
+
+# %% [markdown]
+# ### Use the pipeline to search Encyclopedia Britannica for the top results (top_k=5) related to Elon Musk and extract their entities
+
+# %%
+query = "Elon Musk"
+output = pipeline.run(data={"search":{"query":query}})  # Only the 'search' component is given external input
+
+# %%
+extracted_documents = output['ner']['documents']  # 'ner' is the NERPopulator component; 'documents' is the key its run() returns
+
+# %%
+extracted_documents  # Bare expression so the cell displays the records when run interactively
+
+# %%
+
+
+
diff --git a/ch8/named-entity-recognition/ner-with-haystack-news.ipynb b/ch8/named-entity-recognition/ner-with-haystack-news.ipynb
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 67,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -111,6 +111,13 @@
     "extractor.warm_up()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read the data and apply the extractor"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -125,6 +132,13 @@
     "extractor.run(documents)\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Parse extracted entities and store them in a DataFrame"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -133,13 +147,45 @@
    "source": [
     "# Extract named entities from the documents\n",
     "extracted_documents = extract_named_entities_with_ids(documents)\n",
-    "df = pd.DataFrame(extracted_documents)\n",
-    "df.to_csv(\"ner_output.csv\", index=False)"
+    "df = pd.DataFrame(extracted_documents)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'document_id': '0',\n",
+       " 'content': 'Budget to set scene for election\\n \\n Gordon Brown will seek to put the economy at the centre of Labour\\'s bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to stress the importance of continued economic stability, with low unemployment and interest rates. The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from Â£60,000. But the Conservatives and Lib Dems insist voters face higher taxes and more means-testing under Labour.\\n \\n Treasury officials have said there will not be a pre-election giveaway, but Mr Brown is thought to have about Â£2bn to spare.\\n \\n - Increase in the stamp duty threshold from Â£60,000 \\n  - A freeze on petrol duty \\n  - An extension of tax credit scheme for poorer families \\n  - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties\\' general election manifestos. Ten years ago, buyers had a much greater chance of avoiding stamp duty, with close to half a million properties, in England and Wales alone, selling for less than Â£60,000. Since then, average UK property prices have more than doubled while the starting threshold for stamp duty has not increased. Tax credits As a result, the number of properties incurring stamp duty has rocketed as has the government\\'s tax take. The Liberal Democrats unveiled their own proposals to raise the stamp duty threshold to Â£150,000 in February.\\n \\n The Tories are also thought likely to propose increased thresholds, with shadow chancellor Oliver Letwin branding stamp duty a \"classic Labour stealth tax\". The Tories say whatever the chancellor gives away will be clawed back in higher taxes if Labour is returned to power. 
Shadow Treasury chief secretary George Osborne said: \"Everyone who looks at the British economy at the moment says there has been a sharp deterioration in the public finances, that there is a black hole,\" he said. \"If Labour is elected there will be a very substantial tax increase in the Budget after the election, of the order of around Â£10bn.\"\\n \\n But Mr Brown\\'s former advisor Ed Balls, now a parliamentary hopeful, said an examination of Tory plans for the economy showed there would be a Â£35bn difference in investment by the end of the next parliament between the two main parties. He added: \"I don\\'t accept there is any need for any changes to the plans we have set out to meet our spending commitments.\"\\n \\n For the Lib Dems David Laws said: \"The chancellor will no doubt tell us today how wonderfully the economy is doing,\" he said. \"But a lot of that is built on an increase in personal and consumer debt over the last few years - that makes the economy quite vulnerable potentially if interest rates ever do have to go up in a significant way.\" SNP leader Alex Salmond said his party would introduce a Â£2,000 grant for first time buyers, reduce corporation tax and introduce a citizens pension free from means testing. Plaid Cymru\\'s economics spokesman Adam Price said he wanted help to get people on the housing ladder and an increase in the minimum wage to Â£5.60 an hour.\\n',\n",
+       " 'meta': {'LOC': 'UK,England,Wales',\n",
+       "  'PER': 'George Osborne,Ed,Oliver Letwin,Gordon Brown,Brown',\n",
+       "  'ORG': 'Labour,Shadow Treasury,Treasury'}}"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extracted_documents[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['content'] = df['content'].str.replace(\"\\n\",\" \")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [
     {
@@ -172,111 +218,69 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>0</td>\n",
-       "      <td>Budget to set scene for election\\n \\n Gordon B...</td>\n",
+       "      <td>Budget to set scene for election    Gordon Bro...</td>\n",
        "      <td>{'LOC': 'UK,England,Wales', 'PER': 'George Osb...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>1</td>\n",
-       "      <td>Army chiefs in regiments decision\\n \\n Militar...</td>\n",
+       "      <td>Army chiefs in regiments decision    Military ...</td>\n",
        "      <td>{'LOC': 'Scotland,Iraq', 'PER': 'Eric,Joyce,Ge...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>2</td>\n",
-       "      <td>Howard denies split over ID cards\\n \\n Michael...</td>\n",
+       "      <td>Howard denies split over ID cards    Michael H...</td>\n",
        "      <td>{'LOC': '', 'PER': 'Davis,Ye,Michael Howard,Ti...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>3</td>\n",
-       "      <td>Observers to monitor UK election\\n \\n Minister...</td>\n",
+       "      <td>Observers to monitor UK election    Ministers ...</td>\n",
        "      <td>{'LOC': 'Britain,UK,Northern Ireland', 'PER': ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>4</td>\n",
-       "      <td>Kilroy names election seat target\\n \\n Ex-chat...</td>\n",
+       "      <td>Kilroy names election seat target    Ex-chat s...</td>\n",
        "      <td>{'LOC': 'UK,Derbyshire,London,Erewash,Nottingh...</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2220</th>\n",
-       "      <td>2220</td>\n",
-       "      <td>India opens skies to competition\\n \\n India wi...</td>\n",
-       "      <td>{'LOC': 'Saudi Arabia,India,Gulf,US,Kuwait', '...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2221</th>\n",
-       "      <td>2221</td>\n",
-       "      <td>Yukos bankruptcy 'not US matter'\\n \\n Russian ...</td>\n",
-       "      <td>{'LOC': 'Russia,sk,Gibraltar,US,Houston,Europe...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2222</th>\n",
-       "      <td>2222</td>\n",
-       "      <td>Survey confirms property slowdown\\n \\n Governm...</td>\n",
-       "      <td>{'LOC': 'Wales,UK,Greater London,England', 'PE...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2223</th>\n",
-       "      <td>2223</td>\n",
-       "      <td>High fuel prices hit BA's profits\\n \\n British...</td>\n",
-       "      <td>{'LOC': '', 'PER': 'Martin Broughton,Mike Powe...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2224</th>\n",
-       "      <td>2224</td>\n",
-       "      <td>US trade gap hits record in 2004\\n \\n The gap ...</td>\n",
-       "      <td>{'LOC': 'US,America,China', 'PER': 'Bush', 'OR...</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>2225 rows × 3 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "     document_id                                            content  \\\n",
-       "0              0  Budget to set scene for election\\n \\n Gordon B...   \n",
-       "1              1  Army chiefs in regiments decision\\n \\n Militar...   \n",
-       "2              2  Howard denies split over ID cards\\n \\n Michael...   \n",
-       "3              3  Observers to monitor UK election\\n \\n Minister...   \n",
-       "4              4  Kilroy names election seat target\\n \\n Ex-chat...   \n",
-       "...          ...                                                ...   \n",
-       "2220        2220  India opens skies to competition\\n \\n India wi...   \n",
-       "2221        2221  Yukos bankruptcy 'not US matter'\\n \\n Russian ...   \n",
-       "2222        2222  Survey confirms property slowdown\\n \\n Governm...   \n",
-       "2223        2223  High fuel prices hit BA's profits\\n \\n British...   \n",
-       "2224        2224  US trade gap hits record in 2004\\n \\n The gap ...   \n",
-       "\n",
-       "                                                   meta  \n",
-       "0     {'LOC': 'UK,England,Wales', 'PER': 'George Osb...  \n",
-       "1     {'LOC': 'Scotland,Iraq', 'PER': 'Eric,Joyce,Ge...  \n",
-       "2     {'LOC': '', 'PER': 'Davis,Ye,Michael Howard,Ti...  \n",
-       "3     {'LOC': 'Britain,UK,Northern Ireland', 'PER': ...  \n",
-       "4     {'LOC': 'UK,Derbyshire,London,Erewash,Nottingh...  \n",
-       "...                                                 ...  \n",
-       "2220  {'LOC': 'Saudi Arabia,India,Gulf,US,Kuwait', '...  \n",
-       "2221  {'LOC': 'Russia,sk,Gibraltar,US,Houston,Europe...  \n",
-       "2222  {'LOC': 'Wales,UK,Greater London,England', 'PE...  \n",
-       "2223  {'LOC': '', 'PER': 'Martin Broughton,Mike Powe...  \n",
-       "2224  {'LOC': 'US,America,China', 'PER': 'Bush', 'OR...  \n",
+       "  document_id                                            content  \\\n",
+       "0           0  Budget to set scene for election    Gordon Bro...   \n",
+       "1           1  Army chiefs in regiments decision    Military ...   \n",
+       "2           2  Howard denies split over ID cards    Michael H...   \n",
+       "3           3  Observers to monitor UK election    Ministers ...   \n",
+       "4           4  Kilroy names election seat target    Ex-chat s...   \n",
        "\n",
-       "[2225 rows x 3 columns]"
+       "                                                meta  \n",
+       "0  {'LOC': 'UK,England,Wales', 'PER': 'George Osb...  \n",
+       "1  {'LOC': 'Scotland,Iraq', 'PER': 'Eric,Joyce,Ge...  \n",
+       "2  {'LOC': '', 'PER': 'Davis,Ye,Michael Howard,Ti...  \n",
+       "3  {'LOC': 'Britain,UK,Northern Ireland', 'PER': ...  \n",
+       "4  {'LOC': 'UK,Derbyshire,London,Erewash,Nottingh...  "
       ]
      },
-     "execution_count": 57,
+     "execution_count": 65,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df"
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(\"ner_output.csv\", index=False)"
    ]
   },
   {