"""
Goal: Read in raw EDGAR files, downloadable from the
...

See Example.py for implementation of the EdgarAnalyzer functions
"""
10
+ import os
11
+ import re
12
+ import unicodedata
13
+ from bs4 import BeautifulSoup
14
+ import glob
15
+ import pandas as pd
11
16
12
17
class EdgarAnalyzer(object):
    """
    Object class to read in Edgar 10k Data
    """

    # NOTE: declared @staticmethod because the method does not read any
    # instance state; without it, instance calls such as
    # EdgarAnalyzer().clean_html_content(html) would raise TypeError
    # (the instance would be bound to `html_content`).
    @staticmethod
    def clean_html_content(html_content):
        """Strip HTML markup and noisy artifacts from a raw EDGAR filing.

        Parameters
        ----------
        html_content : str
            Raw HTML text of an EDGAR filing.

        Returns
        -------
        str
            Plain text with tags, HTML entities, gibberish character
            runs, and noisy tokens removed, and whitespace collapsed to
            single spaces.

        Raises
        ------
        Exception
            Re-raised from the last parser attempt if html.parser,
            lxml, and html5lib all fail to parse the content.
        """
        # Parse the HTML content, falling back through progressively
        # more lenient parsers; only give up if all three fail.
        try:
            soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            print(f"html.parser failed: {e}")
            try:
                soup = BeautifulSoup(html_content, "lxml")
            except Exception as e:
                print(f"lxml failed: {e}")
                try:
                    soup = BeautifulSoup(html_content, "html5lib")
                except Exception as e:
                    print(f"html5lib failed: {e}")
                    raise

        # Remove all tags but keep their inner text.
        for tag in soup.find_all(True):
            tag.unwrap()

        # Extract text and normalize Unicode characters.
        text = soup.get_text(separator=' ')
        text = unicodedata.normalize('NFKD', text)

        # Remove any remaining HTML tags and entities.
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'&[a-zA-Z0-9#]+;', ' ', text)

        def remove_gibberish(text):
            # Remove gibberish based on specific character patterns.
            # (Deliberately NOT removing all long words via
            # r'\b\w{15,}\b' — that would drop real words such as
            # "characteristics".)

            # Removes sequences with high special character density.
            text = re.sub(r'[!@#$%^&*()_+={}\[\]:;"\'<>,.?/\\|`~\-]{5,}', '', text)

            # Removes lines that are mostly numbers or symbols.
            text = re.sub(r'^[^a-zA-Z\s]*$', '', text, flags=re.MULTILINE)

            # Removes uuencoded-style blocks ("begin NNN ... end").
            text = re.sub(r'(begin [0-9]{3} [^\n]+\n(.*\n)+end)', '', text, flags=re.MULTILINE)

            # Removes lines that contain too many non-alphanumeric characters.
            text = re.sub(r'^[^\w\s]{10,}$', '', text, flags=re.MULTILINE)

            return text

        def clean_noisy_text(text):
            # Split the text into individual words.
            words = text.split()

            def is_noisy(word):
                # Long tokens mixing upper/lowercase letters and digits
                # (typical of encoded blobs, not English words).
                if len(word) > 15 and (
                    re.search(r'[A-Z]', word) and re.search(r'[a-z]', word) and re.search(r'\d', word)
                ):
                    return True
                # Long tokens containing symbols.
                if len(word) > 15 and re.search(r'[^A-Za-z0-9]', word):
                    return True
                return False

            # Keep only meaningful words, removing the noisy ones.
            cleaned_words = [word for word in words if not is_noisy(word)]
            return ' '.join(cleaned_words)

        text = remove_gibberish(text)
        text = clean_noisy_text(text)
        # Collapse all runs of whitespace into single spaces.
        text = ' '.join(text.split())

        return text