createDB.py
# Build the list of quarterly index files archived in EDGAR from start_year (earliest: 1993) onward; the active code stops at next_year, with the current-quarter block commented out below
import datetime
import sqlite3
import requests
import pandas as pd
from sqlalchemy import create_engine
import csv
import os
import re
current_year = datetime.date.today().year
current_quarter = (datetime.date.today().month - 1) // 3 + 1
start_year = 1993
next_year = 2018
#years = list(range(start_year, current_year))
years = list(range(start_year, next_year))  # half-open range: start_year inclusive, next_year exclusive
quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
history = [(y, q) for y in years for q in quarters]
#for i in range(1, current_quarter + 1):  # may cause a memory error
#    history.append((current_year, 'QTR%d' % i))
urls = ['https://www.sec.gov/Archives/edgar/full-index/%d/%s/master.idx' % (x[0], x[1]) for x in history]
urls.sort()
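# Each master.idx is pipe-delimited with columns CIK|Company Name|Form Type|Date Filed|Filename,
# which is why every record below splits into five fields.
# Note: EDGAR may reject automated requests that lack a descriptive User-Agent header;
# if downloads fail, pass headers={'User-Agent': 'your name your@email'} to requests.get.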
# Download index files and write their contents into SQLite
con = sqlite3.connect('edgar_idx.db')
cur = con.cursor()
cur.execute('DROP TABLE IF EXISTS idx')
cur.execute('CREATE TABLE idx (cik TEXT, conm TEXT, type TEXT, date TEXT, path TEXT)')
for url in urls:
    lines = requests.get(url).text.splitlines()  # one master.idx per year/quarter; all quarters accumulate in one table
    # Skip the ~11 header lines and keep only well-formed 5-field records
    records = [tuple(line.split('|')) for line in lines[11:] if len(line.split('|')) == 5]
    cur.executemany('INSERT INTO idx VALUES (?, ?, ?, ?, ?)', records)
    print(url, 'downloaded and written to SQLite')
    con.commit()
con.close()
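# Optional sanity check (a quick sketch, assuming the loop above completed):
# con = sqlite3.connect('edgar_idx.db')
# print(con.execute('SELECT COUNT(*) FROM idx').fetchone())
# con.close()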
# Read the SQLite table into pandas and export the 10-K subset to CSV (Stata export left commented out)
engine = create_engine('sqlite:///edgar_idx.db')  # or e.g. r'sqlite:///C:\path\to\edgar_idx.db' for an absolute path
with engine.connect() as conn, conn.begin():
    data = pd.read_sql_table('idx', conn)
data = data[data.type == '10-K']
data.index = range(1, len(data) + 1)
#data['conm'] = data['conm'].str.replace(r'\s\s+', ' ')
#data.to_stata('edgar_idx2010.dta')
data.to_csv('edgar_idx.csv')
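# to_csv writes the DataFrame index as column 0, so each CSV row reads:
# [0]=row index, [1]=cik, [2]=conm (company name), [3]=type, [4]=date, [5]=path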
with open('edgar_idx.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the CSV header row
    for line in reader:
        conm = re.sub(r'[\\/]', '', line[2])  # strip slashes so the company name is a safe filename
        saveas = line[0] + '-' + conm + '-' + line[4].replace('/', '-')  # row index + company name + filing date
        #saveas = '-'.join([line[0], line[1].replace('/', '-'), line[2].replace('/', '-'), line[3]])
        # Reorganize to rename the output filename.
        url = 'https://www.sec.gov/Archives/' + line[5].strip()
        print(saveas)
        #with open(saveas, 'wb') as f:
        if not os.path.exists('annual_report'):
            os.makedirs('annual_report')
        with open(os.path.join('annual_report', '%s.txt' % saveas), 'wb') as f:
            f.write(requests.get(url).content)
        print(url, 'downloaded and written to a text file')