#!/usr/bin/python3.4
"""Download every report listed in the APTnotes dataset.

Report metadata is pulled from the aptnotes/data GitHub repository, and
each report is fetched asynchronously from its Box-hosted sharing page.
"""

import asyncio
import hashlib
import json
import os

import aiohttp
import requests
import magic
from bs4 import BeautifulSoup


@asyncio.coroutine
def fetch_report_content(session, file_url, download_path, checksum):
    # SHA-1 hash of the downloaded bytes, for the integrity check below
    hash_check = hashlib.sha1()

    download_response = yield from session.get(file_url)

    try:
        # Stream the response to disk in 1 KB chunks, hashing as we go
        with open(download_path, 'wb') as f_handle:
            while True:
                chunk = yield from download_response.content.read(1024)
                if not chunk:
                    break
                hash_check.update(chunk)
                f_handle.write(chunk)

        # Verify file contents against the expected checksum value
        if hash_check.hexdigest() != checksum:
            os.remove(download_path)
            raise ValueError("File integrity check failed")

    except Exception as unexpected_error:
        message = "[!] Download failure for {}".format(file_url)
        print(message, unexpected_error)
        download_response.close()

    else:
        # Identify the filetype and add an extension if it is a PDF
        file_type = magic.from_file(download_path, mime=True)
        if file_type == "application/pdf":
            pdf_extension_path = download_path + ".pdf"
            os.rename(download_path, pdf_extension_path)
            download_path = pdf_extension_path

        print("[+] Successfully downloaded {}".format(download_path))
        return download_path

    finally:
        yield from download_response.release()


@asyncio.coroutine
def fetch_report_url(session, report_link):
    # Download the report's splash page for parsing
    splash_response = yield from session.get(report_link)

    try:
        splash_page = yield from splash_response.content.read()

        # Parse the splash page for the elements needed to build the
        # direct download URL
        soup = BeautifulSoup(splash_page, 'lxml')
        sections = soup.find('body').find('script').contents[0].split(';')
        app_api = json.loads(sections[1].split('=')[1])['/app-api/enduserapp/shared-item']

        # Build the direct download URL
        box_url = "https://app.box.com/index.php"
        box_args = "?rm=box_download_shared_file&shared_name={}&file_id={}"
        file_url = box_url + box_args.format(app_api['sharedName'],
                                             'f_{}'.format(app_api['itemID']))

    except Exception as unexpected_error:
        message = "[!] Splash page retrieval failure for {}".format(report_link)
        print(message, unexpected_error)
        splash_response.close()

    else:
        return file_url

    finally:
        yield from splash_response.release()


@asyncio.coroutine
def download_report(session, report):
    report_date = report['Date']
    report_title = report['Title']
    report_year = report['Year']
    report_source = report['Source']
    report_link = report['Link']
    report_filename = report['Filename']
    report_sha1 = report['SHA-1']

    # Ensure the year directory exists
    os.makedirs(report_year, exist_ok=True)

    # Set the download path
    download_path = os.path.join(report_year, report_filename)

    # Path the file will have if it is renamed with a PDF extension
    pdf_extension_path = download_path + ".pdf"

    if os.path.exists(download_path) or os.path.exists(pdf_extension_path):
        print("[!] File {} already exists".format(report_filename))
        return

    # Acquire the rate-limiting semaphore (created in __main__) so that
    # no more than ten downloads are in flight at once
    with (yield from sem):
        file_url = yield from fetch_report_url(session, report_link)
        if file_url is None:
            # Splash page retrieval failed; nothing to download
            return
        yield from fetch_report_content(session, file_url,
                                        download_path, report_sha1)


@asyncio.coroutine
def download_all_reports(loop, APT_reports):
    with aiohttp.ClientSession(loop=loop) as session:
        # Schedule one download task per report
        download_queue = [loop.create_task(download_report(session, report))
                          for report in APT_reports]
        yield from asyncio.wait(download_queue)


if __name__ == '__main__':
    # Retrieve the APTnotes report metadata
    github_url = "https://raw.githubusercontent.com/aptnotes/data/master/APTnotes.json"
    APTnotes = requests.get(github_url)

    if APTnotes.status_code == 200:
        # Load the APT report metadata into a JSON container
        APT_reports = json.loads(APTnotes.text)

        # Reverse the order of reports so downloads run newest to oldest
        APT_reports.reverse()

        # Semaphore used by download_report() for rate limiting
        sem = asyncio.Semaphore(10)

        # Create the async loop and run until every download completes
        loop = asyncio.get_event_loop()
        loop.run_until_complete(download_all_reports(loop, APT_reports))
        loop.close()
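
# Usage sketch (assumptions not stated in the source: the script is saved
# as download.py, and `import magic` refers to the python-magic package,
# which also needs the system libmagic library installed):
#
#   $ pip install aiohttp requests python-magic beautifulsoup4 lxml
#   $ python3.4 download.py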