Skip to content

Commit

Permalink
clean up notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
lfunderburk committed Dec 19, 2024
1 parent 29354c2 commit ee779db
Show file tree
Hide file tree
Showing 3 changed files with 239 additions and 74 deletions.
140 changes: 140 additions & 0 deletions ch8/named-entity-recognition/ner-from-web.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# %%
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack.components.extractors import NamedEntityExtractor
from haystack import component, Document
from typing import Any, Dict, List, Union

from dotenv import load_dotenv
import os

# Load variables from the local .env file, then look up the API keys.
# Each lookup yields None when the variable is not set.
load_dotenv(".env")
open_ai_key = os.environ.get("OPENAI_API_KEY")
serper_api_key = os.environ.get("SERPERDEV_API_KEY")


# %% [markdown]
# ### Define custom component

# %%
@component
class NERPopulator():
    """Group the named entities attached to each document by entity type.

    For every incoming document, the annotations stored under
    ``meta['named_entities']`` are filtered by score and entity type
    (LOC, PER, ORG, MISC). The unique matching text spans are reported per
    type as comma-separated strings, alongside the document's id, content
    and source URL.

    Args:
        min_score: Annotations scoring below this value are ignored.
            Defaults to 0.8.
    """

    def __init__(self, min_score: float = 0.8) -> None:
        self.min_score = min_score

    @component.output_types(documents=List[Dict[str, Any]])
    def run(self, sources: List[Document]) -> Dict[str, List[Dict[str, Any]]]:
        """Build one summary record per document.

        Args:
            sources: Documents whose ``meta`` may hold a ``named_entities``
                list of annotations exposing ``entity``, ``start``, ``end``
                and ``score`` attributes.

        Returns:
            A dictionary with the single key ``'documents'`` holding a list of
            plain dictionaries (not ``Document`` objects), each with the keys
            ``'document_id'``, ``'content'`` and ``'meta'``. ``meta`` maps
            ``LOC``/``PER``/``ORG``/``MISC`` to a comma-separated, alphabetically
            sorted string of unique entities, plus ``url`` (``'N/A'`` when the
            document carries no URL).
        """
        extracted_data = []

        for document in sources:
            content = document.content
            doc_id = document.id
            named_entities = document.meta.get('named_entities', [])
            url = document.meta.get('url', 'N/A')  # Default to 'N/A' if URL is not available

            # Slice against an empty string when the document has no text, so
            # a content-less document cannot raise on `None[start:end]`.
            text = content or ""

            # Sets to store unique entities by type
            entities_by_type: Dict[str, set] = {
                "LOC": set(),
                "PER": set(),
                "ORG": set(),
                "MISC": set(),
            }

            # Keep only confident annotations of a tracked type
            for entity in named_entities:
                if float(entity.score) < self.min_score:
                    continue
                if entity.entity not in entities_by_type:
                    continue

                word = text[entity.start:entity.end]
                if word:  # an empty span carries no entity text
                    entities_by_type[entity.entity].add(word)

            # Comma-separated values per type; sorted so that repeated runs
            # produce the same string (set iteration order is not stable).
            meta = {
                label: ",".join(sorted(words))
                for label, words in entities_by_type.items()
            }
            meta["url"] = url

            # Append the result for this document
            extracted_data.append({
                'document_id': doc_id,
                'content': content,
                'meta': meta,
            })

        return {"documents": extracted_data}


# %% [markdown]
# ### Build Haystack pipeline with custom component

# %%

# Assemble the pipeline:
# search -> fetcher -> htmldocument -> cleaner -> extractor -> ner
pipeline = Pipeline()

# Instantiate each stage.
# NOTE(review): allowed_domains is given a full URL here; confirm that
# SerperDevWebSearch accepts URLs rather than bare domain names.
web_search = SerperDevWebSearch(
    top_k=5,
    allowed_domains=["https://www.britannica.com/"],
)
link_content = LinkContentFetcher(retry_attempts=3, timeout=10)
html_to_doc = HTMLToDocument()
document_cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False,
    remove_substrings=['\n-'],
)
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
# Presumably loads the NER model up front -- confirm against the Haystack docs.
extractor.warm_up()

ner_component = NERPopulator()

# Register the stages under the names used by the connections below.
pipeline_steps = (
    ('search', web_search),
    ('fetcher', link_content),
    ('htmldocument', html_to_doc),
    ('cleaner', document_cleaner),
    ('extractor', extractor),
    ('ner', ner_component),
)
for step_name, step_instance in pipeline_steps:
    pipeline.add_component(name=step_name, instance=step_instance)

# Wire each stage's output into the next stage's input.
pipeline_links = (
    ("search.links", "fetcher.urls"),
    ("fetcher", "htmldocument"),
    ("htmldocument", "cleaner"),
    ("cleaner", "extractor"),
    ("extractor", "ner"),
)
for sender, receiver in pipeline_links:
    pipeline.connect(sender, receiver)


# %% [markdown]
# ### Use the pipeline to search Encyclopedia Britannica for the top articles about Elon Musk and extract their named entities

# %%
# Feed the search term to the 'search' component and run the whole pipeline.
query = "Elon Musk"
search_inputs = {"search": {"query": query}}
output = pipeline.run(data=search_inputs)

# %%
# Per-document entity records produced by the 'ner' component.
extracted_documents = output["ner"]["documents"]

# %%
extracted_documents

# %%



152 changes: 78 additions & 74 deletions ch8/named-entity-recognition/ner-with-haystack-news.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -111,6 +111,13 @@
"extractor.warm_up()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read the data and apply the extractor"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -125,6 +132,13 @@
"extractor.run(documents)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse extracted entities and store "
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -133,13 +147,45 @@
"source": [
"# Extract named entities from the documents\n",
"extracted_documents = extract_named_entities_with_ids(documents)\n",
"df = pd.DataFrame(extracted_documents)\n",
"df.to_csv(\"ner_output.csv\", index=False)"
"df = pd.DataFrame(extracted_documents)\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'document_id': '0',\n",
" 'content': 'Budget to set scene for election\\n \\n Gordon Brown will seek to put the economy at the centre of Labour\\'s bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to stress the importance of continued economic stability, with low unemployment and interest rates. The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from £60,000. But the Conservatives and Lib Dems insist voters face higher taxes and more means-testing under Labour.\\n \\n Treasury officials have said there will not be a pre-election giveaway, but Mr Brown is thought to have about £2bn to spare.\\n \\n - Increase in the stamp duty threshold from £60,000 \\n - A freeze on petrol duty \\n - An extension of tax credit scheme for poorer families \\n - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties\\' general election manifestos. Ten years ago, buyers had a much greater chance of avoiding stamp duty, with close to half a million properties, in England and Wales alone, selling for less than £60,000. Since then, average UK property prices have more than doubled while the starting threshold for stamp duty has not increased. Tax credits As a result, the number of properties incurring stamp duty has rocketed as has the government\\'s tax take. The Liberal Democrats unveiled their own proposals to raise the stamp duty threshold to £150,000 in February.\\n \\n The Tories are also thought likely to propose increased thresholds, with shadow chancellor Oliver Letwin branding stamp duty a \"classic Labour stealth tax\". The Tories say whatever the chancellor gives away will be clawed back in higher taxes if Labour is returned to power. 
Shadow Treasury chief secretary George Osborne said: \"Everyone who looks at the British economy at the moment says there has been a sharp deterioration in the public finances, that there is a black hole,\" he said. \"If Labour is elected there will be a very substantial tax increase in the Budget after the election, of the order of around £10bn.\"\\n \\n But Mr Brown\\'s former advisor Ed Balls, now a parliamentary hopeful, said an examination of Tory plans for the economy showed there would be a £35bn difference in investment by the end of the next parliament between the two main parties. He added: \"I don\\'t accept there is any need for any changes to the plans we have set out to meet our spending commitments.\"\\n \\n For the Lib Dems David Laws said: \"The chancellor will no doubt tell us today how wonderfully the economy is doing,\" he said. \"But a lot of that is built on an increase in personal and consumer debt over the last few years - that makes the economy quite vulnerable potentially if interest rates ever do have to go up in a significant way.\" SNP leader Alex Salmond said his party would introduce a £2,000 grant for first time buyers, reduce corporation tax and introduce a citizens pension free from means testing. Plaid Cymru\\'s economics spokesman Adam Price said he wanted help to get people on the housing ladder and an increase in the minimum wage to £5.60 an hour.\\n',\n",
" 'meta': {'LOC': 'UK,England,Wales',\n",
" 'PER': 'George Osborne,Ed,Oliver Letwin,Gordon Brown,Brown',\n",
" 'ORG': 'Labour,Shadow Treasury,Treasury'}}"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extracted_documents[0]"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"df['content'] = df['content'].str.replace(\"\\n\",\" \")"
]
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 65,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -172,111 +218,69 @@
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Budget to set scene for election\\n \\n Gordon B...</td>\n",
" <td>Budget to set scene for election Gordon Bro...</td>\n",
" <td>{'LOC': 'UK,England,Wales', 'PER': 'George Osb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Army chiefs in regiments decision\\n \\n Militar...</td>\n",
" <td>Army chiefs in regiments decision Military ...</td>\n",
" <td>{'LOC': 'Scotland,Iraq', 'PER': 'Eric,Joyce,Ge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Howard denies split over ID cards\\n \\n Michael...</td>\n",
" <td>Howard denies split over ID cards Michael H...</td>\n",
" <td>{'LOC': '', 'PER': 'Davis,Ye,Michael Howard,Ti...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Observers to monitor UK election\\n \\n Minister...</td>\n",
" <td>Observers to monitor UK election Ministers ...</td>\n",
" <td>{'LOC': 'Britain,UK,Northern Ireland', 'PER': ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Kilroy names election seat target\\n \\n Ex-chat...</td>\n",
" <td>Kilroy names election seat target Ex-chat s...</td>\n",
" <td>{'LOC': 'UK,Derbyshire,London,Erewash,Nottingh...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2220</th>\n",
" <td>2220</td>\n",
" <td>India opens skies to competition\\n \\n India wi...</td>\n",
" <td>{'LOC': 'Saudi Arabia,India,Gulf,US,Kuwait', '...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2221</th>\n",
" <td>2221</td>\n",
" <td>Yukos bankruptcy 'not US matter'\\n \\n Russian ...</td>\n",
" <td>{'LOC': 'Russia,sk,Gibraltar,US,Houston,Europe...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2222</th>\n",
" <td>2222</td>\n",
" <td>Survey confirms property slowdown\\n \\n Governm...</td>\n",
" <td>{'LOC': 'Wales,UK,Greater London,England', 'PE...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2223</th>\n",
" <td>2223</td>\n",
" <td>High fuel prices hit BA's profits\\n \\n British...</td>\n",
" <td>{'LOC': '', 'PER': 'Martin Broughton,Mike Powe...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2224</th>\n",
" <td>2224</td>\n",
" <td>US trade gap hits record in 2004\\n \\n The gap ...</td>\n",
" <td>{'LOC': 'US,America,China', 'PER': 'Bush', 'OR...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2225 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" document_id content \\\n",
"0 0 Budget to set scene for election\\n \\n Gordon B... \n",
"1 1 Army chiefs in regiments decision\\n \\n Militar... \n",
"2 2 Howard denies split over ID cards\\n \\n Michael... \n",
"3 3 Observers to monitor UK election\\n \\n Minister... \n",
"4 4 Kilroy names election seat target\\n \\n Ex-chat... \n",
"... ... ... \n",
"2220 2220 India opens skies to competition\\n \\n India wi... \n",
"2221 2221 Yukos bankruptcy 'not US matter'\\n \\n Russian ... \n",
"2222 2222 Survey confirms property slowdown\\n \\n Governm... \n",
"2223 2223 High fuel prices hit BA's profits\\n \\n British... \n",
"2224 2224 US trade gap hits record in 2004\\n \\n The gap ... \n",
"\n",
" meta \n",
"0 {'LOC': 'UK,England,Wales', 'PER': 'George Osb... \n",
"1 {'LOC': 'Scotland,Iraq', 'PER': 'Eric,Joyce,Ge... \n",
"2 {'LOC': '', 'PER': 'Davis,Ye,Michael Howard,Ti... \n",
"3 {'LOC': 'Britain,UK,Northern Ireland', 'PER': ... \n",
"4 {'LOC': 'UK,Derbyshire,London,Erewash,Nottingh... \n",
"... ... \n",
"2220 {'LOC': 'Saudi Arabia,India,Gulf,US,Kuwait', '... \n",
"2221 {'LOC': 'Russia,sk,Gibraltar,US,Houston,Europe... \n",
"2222 {'LOC': 'Wales,UK,Greater London,England', 'PE... \n",
"2223 {'LOC': '', 'PER': 'Martin Broughton,Mike Powe... \n",
"2224 {'LOC': 'US,America,China', 'PER': 'Bush', 'OR... \n",
" document_id content \\\n",
"0 0 Budget to set scene for election Gordon Bro... \n",
"1 1 Army chiefs in regiments decision Military ... \n",
"2 2 Howard denies split over ID cards Michael H... \n",
"3 3 Observers to monitor UK election Ministers ... \n",
"4 4 Kilroy names election seat target Ex-chat s... \n",
"\n",
"[2225 rows x 3 columns]"
" meta \n",
"0 {'LOC': 'UK,England,Wales', 'PER': 'George Osb... \n",
"1 {'LOC': 'Scotland,Iraq', 'PER': 'Eric,Joyce,Ge... \n",
"2 {'LOC': '', 'PER': 'Davis,Ye,Michael Howard,Ti... \n",
"3 {'LOC': 'Britain,UK,Northern Ireland', 'PER': ... \n",
"4 {'LOC': 'UK,Derbyshire,London,Erewash,Nottingh... "
]
},
"execution_count": 57,
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"ner_output.csv\", index=False)"
]
},
{
Expand Down
Loading

0 comments on commit ee779db

Please # to comment.