# These are the helper libraries we're going to use

# We'll need to parse CSV files, we use one for the company
import csv
# We need to read optional settings (like credentials) from environment variables
import os
# We need to work on paths on the disk (files)
import pathlib
# We need to search file names for years
import re

# We need to make requests to the internet
import requests
# Helps us parse dates from file names
from dateutil.parser import parse, _parser
# The "Google search" part
from googleapiclient.discovery import build

# My credentials for google.
# SECURITY: these values are committed in plain text, so anyone who can read
# this file can use them. They are kept only as fallbacks so existing runs keep
# working; rotate the key and supply GOOGLE_API_KEY / GOOGLE_CSE_ID through the
# environment instead. An unset or empty environment variable falls back to
# the value below.
my_api_key = os.environ.get("GOOGLE_API_KEY") or "AIzaSyAFIj698EzO--gjjrtDs31cAbh2bDIG7Yw"
my_cse_id = os.environ.get("GOOGLE_CSE_ID") or "300863d8828cfcee5"

# This lets me read in the CSV file I downloaded that includes all the S&P companies.
# All rows are read up front (one dict per row, keyed by the header line) so the
# file handle is closed straight away instead of staying open for the whole run.
# newline="" is what the csv module asks for when a file is opened for reading.
with open("companies.csv", newline="") as companies_csv:
    company_file = list(csv.DictReader(companies_csv))

# Actually does the google search
def google_search(search_term, **kwargs):
    """Run one Google Custom Search query and return its result items.

    Args:
        search_term: The query text to search for.
        **kwargs: Extra parameters passed straight through to ``cse().list()``.

    Returns:
        The list stored under ``items`` in the API response, or an empty list
        when the response has no ``items`` entry.
    """
    service = build("customsearch", "v1", developerKey=my_api_key)
    res = service.cse().list(q=search_term, cx=my_cse_id, **kwargs).execute()
    # A search with zero hits comes back without an "items" key at all, so
    # indexing res['items'] would raise KeyError and stop the whole script.
    return res.get('items', [])

# Where to put the downloaded reports. This is a hard-coded absolute path and
# nothing in this file creates it, so presumably the folder has to exist
# already before the script runs (TODO confirm).
results_folder: pathlib.Path = pathlib.Path('/Volumes/docs/csr_reports')

# This tries to get the year of the report from the google result
# It's best effort, and could be improved
def try_get_year(search_result):
    """Guess the publication year of the report behind one Google result.

    Args:
        search_result: One result item (a dict) from the Google search. Its
            'title' and 'snippet' text are examined when present.

    Returns:
        The year as an int. 2020 is returned when no year can be found.
    """
    # First try the title of the google result, then the "snippet", that little
    # text below the link on google. The snippet doesn't work a lot, because
    # snippets say things like "In 1997, our company started. This is our 2020
    # report" and the date parser can't decide which of those years to pick.
    for field in ('title', 'snippet'):
        text = search_result.get(field)
        if not text:
            # Nothing to parse for this field; move on to the next source
            continue
        try:
            return parse(text, fuzzy=True).year
        except (ValueError, OverflowError):
            # dateutil could not turn the text into a date (its ParserError is
            # a ValueError), so fall through to the next source
            continue

    # Next, just search for any 4 digits in a row in the snippet, and use the
    # first run found. Converted to int so every path returns the same type.
    match = re.search(r'\d{4}', search_result.get('snippet') or '')
    if match:
        return int(match.group())

    # If all else fails, just return 2020
    return 2020

# Read this as "FOR every row IN the company file spreadsheet"
for row in company_file:
    # First, grab the name
    company_name = row['Name']
    # Then, grab the symbol
    company_symbol = row['Symbol']

    # First, check and see if we already downloaded a report. This looks in our downloads folder
    # for a file named "<symbol>_<anything>", and creates a list of those files. The underscore
    # matters: without it the symbol "A" would also match "AAPL_2020_csr.pdf".
    potential_matching_reports = list(results_folder.glob(f"{company_symbol}_*"))

    # If the list of files is not empty, we already have the report
    if potential_matching_reports:
        # Just prints information to the screen when this script is run
        print(f"Report for company {company_name} already exists... skipping")
        continue

    # Otherwise, we should download the report
    # Set the google search as "<Company name> corporate sustainability report 2020 filetype:pdf"
    searchterm = f"{company_name} corporate sustainability report 2020 filetype:pdf"
    # Print the searchterm to the screen
    print(f"Searching with \"{searchterm}\"")
    # Actually do the google search
    search_results = google_search(searchterm)

    # For each result from google, do something
    for result in search_results:
        # First, call the code above to try to parse the year out of the google result
        year_of_report = try_get_year(result)
        # This is where we put the final file. This can be read as "reports/<symbol>_<year>_csr.pdf"
        output_pdf = results_folder / f"{company_symbol}_{year_of_report}_csr.pdf"
        # Tell the user we're going to try to download a link
        print(f"Retrieving link from {result['link']}")
        try:
            # Make a request to the link that google gets us for the file. Give up if the
            # server stays silent for 60 seconds. The "with" closes the connection afterwards.
            with requests.get(result['link'], timeout=60, stream=True) as response:
                # Treat error pages (404, 403, ...) as failures instead of saving them as a PDF
                response.raise_for_status()
                # Open the file on the system to write to it.
                with open(output_pdf, 'wb') as fh:
                    # Start downloading the file in chunks of 1MB. As we receive a chunk, write it to the file
                    for chunk in response.iter_content(1024 * 1024):
                        fh.write(chunk)
        except (requests.RequestException, OSError):
            # If something goes wrong (the download times out, the website doesn't allow us to
            # download the file, the disk can't be written, etc) just give up and print
            print(f"Failed to download report for company {company_name}")
            # Remove any half-written file, otherwise the next run would find it and
            # believe the report was already downloaded
            try:
                output_pdf.unlink()
            except OSError:
                # Nothing was written, or it can't be removed; nothing more we can do here
                pass
        # This means we only use the FIRST google result, which is the equivalent of trusting
        # that the first link is always right. We work on the first result and then "break"
        # out of the loop, and don't do anything with the rest of the results
        break