diff --git a/README.md b/README.md index 3d2707e..666da9e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,10 @@ stanford_pipeline Program to run scraped news stories through Stanford's CoreNLP program. -Complete work-in-progress repo. +The program pulls stories added to the database within the past day and that +aren't currently parsed using CoreNLP. Once parsed, the parse trees are placed +back into the database. The program is currently set to process the first six +sentences of a story. Usage ----- diff --git a/parser.py b/parser.py index 9a405bd..43d8a30 100644 --- a/parser.py +++ b/parser.py @@ -6,6 +6,23 @@ def stanford_parse(coll, stories, stanford): + """ + Runs stories pulled from the MongoDB instance through CoreNLP. Updates + the database entry with the parsed sentences. Currently set to run the + first 6 sentences. + + Parameters + ---------- + + coll: pymongo.collection.Collection. + Collection within MongoDB that holds the scraped news stories. + + stories: pymongo.cursor.Cursor. + Stories pulled from the MongoDB instance. + + stanford: String. + Directory path for Stanford CoreNLP. + """ logger = logging.getLogger('stanford') logger.info('Setting up CoreNLP.') diff --git a/process.py b/process.py index e59a1f9..1c79a28 100644 --- a/process.py +++ b/process.py @@ -13,6 +13,19 @@ def make_conn(db_auth, db_user, db_pass): """ Function to establish a connection to a local MonoDB instance. + Parameters + ---------- + + db_auth: String. + MongoDB database that should be used for user authentication. + + db_user: String. + Username for MongoDB authentication. + + db_pass: String. + Password for MongoDB authentication. + + Returns ------- @@ -31,8 +44,8 @@ def make_conn(db_auth, db_user, db_pass): def query_today(collection, date): """ Function to query the MongoDB instance and obtain results for the desired - date range. The query constructed is: greater_than_date > results - < less_than_date. + date range. 
Pulls stories that aren't Stanford parsed yet + (``"stanford": 0``) and that were added within the last day. Parameters ---------- @@ -40,31 +53,14 @@ def query_today(collection, date): collection: pymongo.collection.Collection. Collection within MongoDB that holds the scraped news stories. - less_than_date: Datetime object. - Date for which results should be older than. For example, - if the date running is the 25th, and the desired date is - the 24th, then the `less_than_date` is the 25th. - - greater_than_date: Datetime object. - Date for which results should be older than. For - example, if the date running is the 25th, and the - desired date is the 24th, then the `greater_than_date` - is the 23rd. - - write_file: Boolean. - Option indicating whether to write the results from the web - scraper to an intermediate file. Defaults to false. + date: String. + Current date on which the program is running. Returns ------- - posts: List. - List of dictionaries of results from the MongoDB query. - - - final_out: String. - If `write_file` is True, this contains a string representation - of the query results. Otherwise, contains an empty string. + posts: pymongo.cursor.Cursor. + Results from the MongoDB query. """