Commit 9b9be60

Merge pull request #2 from pratikrelekar/xinyao

Update module.py

2 parents 6b866a1 + beb9167
1 file changed: +85 -2 lines changed

module.py (+85 -2)
@@ -1,5 +1,4 @@
 """
-
 Goal: Read in raw Edgar files, downloadable from the
 ...
@@ -8,10 +7,94 @@
 
 See Example.py for implementation of the EdgarAnalyzer functions
 """
+import os
+import re
+import unicodedata
+from bs4 import BeautifulSoup
+import glob
+import pandas as pd
 
 class EdgarAnalyzer(object):
     """
     Object class to read in Edgar 10k Data
-
     """
+    def clean_html_content(html_content):
+        # Parse the HTML content, falling back to lenient parsers on error
+        try:
+            soup = BeautifulSoup(html_content, "html.parser")
+        except Exception as e:
+            print(f"html.parser failed: {e}")
+            try:
+                soup = BeautifulSoup(html_content, "lxml")
+            except Exception as e:
+                print(f"lxml failed: {e}")
+                try:
+                    soup = BeautifulSoup(html_content, "html5lib")
+                except Exception as e:
+                    print(f"html5lib failed: {e}")
+                    raise
+        # soup = BeautifulSoup(html_content, "html.parser")
+
+        # Removes all tags
+        for tag in soup.find_all(True):
+            tag.unwrap()
+
+        # Extract text and normalize Unicode characters
+        text = soup.get_text(separator=' ')
+        text = unicodedata.normalize('NFKD', text)
+
+        # Remove any remaining HTML entities
+        text = re.sub(r'<.*?>', '', text)  # Remove any remaining HTML tags
+        text = re.sub(r'&[a-zA-Z0-9#]+;', ' ', text)
+
+        # Define a function to remove gibberish based on specific patterns
+        def remove_gibberish(text):
+            # Removing long sequences of characters without spaces is disabled,
+            # since it would also strip long ordinary words, e.g. "characteristics":
+            # text = re.sub(r'\b\w{15,}\b', '', text)
+
+            # Removes sequences with high special character density
+            text = re.sub(r'[!@#$%^&*()_+={}\[\]:;"\'<>,.?/\\|`~\-]{5,}', '', text)
+
+            # Removes lines that are mostly numbers or symbols
+            text = re.sub(r'^[^a-zA-Z\s]*$', '', text, flags=re.MULTILINE)
+
+            # Additional patterns for gibberish removal
+            # Removes uuencoded "begin ... end" blocks
+            text = re.sub(r'(begin [0-9]{3} [^\n]+\n(.*\n)+end)', '', text, flags=re.MULTILINE)
+
+            # Removes lines that contain too many non-alphanumeric characters
+            text = re.sub(r'^[^\w\s]{10,}$', '', text, flags=re.MULTILINE)
+
+            return text
+
+
+        def clean_noisy_text(text):
+            # Split the text into individual words
+            words = text.split()
+
+            # Define a function to identify "noisy" strings
+            def is_noisy(word):
+                # A word longer than 15 characters that mixes uppercase,
+                # lowercase, and digits is treated as noise
+                if len(word) > 15 and (
+                    re.search(r'[A-Z]', word) and re.search(r'[a-z]', word) and re.search(r'\d', word)
+                ):
+                    return True
+                # So is a word longer than 15 characters that contains symbols
+                if len(word) > 15 and re.search(r'[^A-Za-z0-9]', word):
+                    return True
+                return False
+
+            # Keep only meaningful words, removing the noisy ones
+            cleaned_words = [word for word in words if not is_noisy(word)]
+
+            # Return the cleaned text
+            return ' '.join(cleaned_words)
+
+        text = remove_gibberish(text)
+        text = clean_noisy_text(text)
+        text = ' '.join(text.split())
 
+        return text
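
For reference, a minimal usage sketch of the new method (the sample HTML string is hypothetical, not part of this commit). Note that clean_html_content is declared without self, so as committed it should be called on the class itself; decorating it with @staticmethod would also make instance calls safe:

from module import EdgarAnalyzer

# Hypothetical raw filing snippet; real EDGAR 10-K files are far larger
raw_html = (
    "<html><body>"
    "<p>Item&nbsp;1A. <b>Risk Factors</b></p>"
    "<p>#####=====#####</p>"
    "</body></html>"
)

# Called on the class: the method body never uses self, and calling it on
# an instance would pass the instance itself as html_content
cleaned = EdgarAnalyzer.clean_html_content(raw_html)
print(cleaned)  # roughly: Item 1A. Risk Factors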
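The glob and pandas imports added at the top of module.py are not used in this hunk and presumably serve functions elsewhere in the module. A hypothetical batch-cleaning loop along those lines (the directory layout and column names are assumptions, not part of this commit):

import glob
import pandas as pd

from module import EdgarAnalyzer

# Hypothetical folder of raw downloaded filings
records = []
for path in glob.glob("edgar_raw/*.txt"):
    with open(path, encoding="utf-8", errors="ignore") as f:
        records.append({"file": path,
                        "text": EdgarAnalyzer.clean_html_content(f.read())})

# Tabulate the cleaned filings for downstream analysis
df = pd.DataFrame(records)
print(df.head())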
