"""Module QueryWRDS — build and query a local SQLite mirror of WRDS (CRSP/Compustat) data."""
# MIT License
# 
# Copyright (c) 2023 Andrew Maurice Perry
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import wrds
import numpy as np
import pandas as pd
import datetime
import sqlalchemy
from cprint import *
from pandas.tseries.offsets import *
import time
from pympler import asizeof
import pathlib
from py_functions import wavg, winsorize
import subprocess

import warnings
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)

np.seterr(divide='ignore')

class QueryWRDS:

    # TODO: 
    #       (1) add more detail to string repr
    #       (2) setup update reminder if information is over a year out of date
    #       (3) blocking so it will work on other computers

    def __init__(self, WRDS_username: str, local_db_path: pathlib.Path = None, update_all_tables: bool = False, tables_to_update: list = None, update_tolerance: int = 3) -> None:
        """
        Initializes the QueryWRDS class. During this process all of the files needed from
            WRDS are downloaded to create a local SQL database. Additionally, combined files
            are created. The first files that are created are CRSP_M (monthly) and CRSP_D (daily). These
            tables contain all of the variables in the stock file, names file, and delisting 
            file. Additionally, the CCM file is created; this 
            file contains the most commonly used CRSP variables and the Compustat variables
            used to make the standard Fama-French anomaly characteristics, the standard 
            anomaly characteristics are created

        Parameters
        ___________
        WRDS_username: Personal WRDS username.

        local_db_path: default = None; Location to create and read from the local SQL database.

        update_all_tables: default = False; If true the local database is deleted and recreated 
            by downloading from WRDS.

        tables_to_update: default = None (treated as an empty list); List of tables to update from WRDS.

        update_tolerance: default = 3; Number of quarters the tables can be out of date.

        Note
        _____
        If no 'local_db_path' is given then the WRDS database is created in the current directory

        Note
        _____
        The table names in the local SQL database mirror those found on WRDS but with underscores
            replacing periods. Thus, when updating tables need to use the local names of the tables
            (i.e.) CRSP_MSF instead of CRSP.MSF

        Note
        _____
        Anomaly characteristics created:
            * bm:   Book-to-Market
            * pe:   Price-to-Earnings
            * cfp:  Cashflow-to-Price
            * inv:  Investment
            * op:   Operating Profitability
        """
        # list of tables from WRDS to make local
        WRDS_TABLES = ['FF.FACTORS_DAILY',          # FamaFrench Factors daily (used for risk-free rate) 
                       'FF.FACTORS_MONTHLY',        # FamaFrench Factors monthly (used for risk-free rate)
                       'CRSP.CCMXPF_LINKTABLE',     # CCM link table used to merge CRSP and Compustat
                       'CRSP.MSEDELIST',            # CRSP monthly delist events
                       'CRSP.MSF',                  # CRSP monthly stock file
                       'CRSP.MSENAMES',             # CRSP monthly event file
                       'COMPA.FUNDA',               # Compustat annual observations
                       'COMPA.FUNDQ']               # Compustat quarterly observations            

        # list of created tables
        CREATED_TABLES = ['CRSP_M',     # CRSP monthly file (i.e. merged CRSP.MSF, CRSP.MSENAMES, CRSP.MSEDELIST) 
                          'CCM']        # CRSP/Compustat merged file 

        # set class attributes
        self.WRDS_username = WRDS_username
        if(local_db_path is None):
            # set location to be in the current directory
            self.local_db_path = pathlib.Path('WRDS.db')
        else:
            # user specified location
            self.local_db_path = local_db_path

        self.update_all_tables = update_all_tables
        # BUG FIX: the original default of 'tables_to_update' was a mutable list literal ([]),
        # which is shared across every call of __init__; use a None sentinel instead and
        # substitute a fresh empty list here.
        self.tables_to_update = tables_to_update if(tables_to_update is not None) else []
        self.update_tolerance = update_tolerance
        self.today_date = datetime.date.today().strftime('%Y-%m-%d')

        # used to update all tables
        if(self.update_all_tables and self.local_db_path.exists()):
            cprint.warn('Updating the tables in the local database. This process could take a long time...')
            os.remove(self.local_db_path)

        # create sql engine
        self.sql_engine = sqlalchemy.create_engine('sqlite:///' + str(self.local_db_path))

        # list of current tables
        # check to see if all required tables are present, if not load the ones that are missing
        inspect = sqlalchemy.inspect(self.sql_engine)
        self.curr_tables = inspect.get_table_names()

        # delete tables that should be updated
        for table_name in self.tables_to_update:
            if(table_name in self.curr_tables):
                with self.sql_engine.connect() as conn:
                    if(table_name not in CREATED_TABLES):
                        _ = conn.execute(f"""DROP TABLE {table_name}""")
                        # drop combo files and remake 
                        for cr_table in CREATED_TABLES:
                            _ = conn.execute(f'DROP TABLE {cr_table}')
                    else:
                        # only drop the created table and remake
                        _ = conn.execute(f'DROP TABLE {table_name}')

        table_names = [name.replace('.', '_') for name in WRDS_TABLES] # local table names

        # check CSV directory for files to include
        # NOTE(fix): skip this step when the directory does not exist so that a missing
        #            'CSVtoSQL' folder no longer raises FileNotFoundError on other machines.
        CSV_directory = self.local_db_path.parent / 'CSVtoSQL'
        if(CSV_directory.is_dir()):
            for csvfile in os.listdir(CSV_directory):
                f = os.path.join(CSV_directory, csvfile)
                if(os.path.isfile(f)):
                    filepath = pathlib.Path(f)
                    # BUG FIX: str.strip('.csv') strips any leading/trailing characters in the
                    # set {'.', 'c', 's', 'v'} (e.g. 'stocks.csv' -> 'tock'); pathlib's .stem
                    # removes only the final extension.
                    tablename = filepath.stem
                    if(tablename in self.curr_tables): continue
                    s = time.time()
                    cprint.info(f'Adding {filepath.name} to SQL database {self.local_db_path.name}...')
                    subprocess.call(['sqlite3', f'{self.local_db_path}', '.mode csv', f'.import {filepath} {tablename}', '.mode columns'])
                    e = time.time()
                    cprint.info(f'Finished {filepath.name}: {round(e - s, 3)}s')

        # update current tables
        self.curr_tables = inspect.get_table_names()

        # read in the data from WRDS
        if(not all(elem in self.curr_tables for elem in table_names)):
            missing_tables = list(set(table_names) - set(inspect.get_table_names()))
            cprint.warn(f'The following tables are missing from the local database: {missing_tables}. Querying WRDS to add them to the local database.')
            cprint.info('Connecting to WRDS...')
            self.WRDS_db = wrds.Connection(username = self.WRDS_username)
            for table_name in missing_tables:
                # local names use '_' where WRDS uses '.'; only the first one separates library.table
                table = table_name.replace('_', '.', 1)
                print('-------------------------------------')
                cprint.info(f'Starting {table}')
                s = time.time()
                sql_str = '''SELECT * FROM ''' + table
                # download the data to a dataframe
                df = self.WRDS_db.raw_sql(sql_str)
                cprint.ok(f'Dataframe in memory: {asizeof.asizeof(df) / (10 ** 9)}GB')
                # add end of month column for CRSP_MSEDELIST
                if(table_name == 'CRSP_MSEDELIST'): df['date'] = df.dlstdt + MonthEnd(0)
                if(table_name == 'CRSP_DSEDELIST'): df['date'] = df.dlstdt # create date column for merging
                # write the dataframe to the local sql database
                df.to_sql(table_name, con = self.sql_engine, if_exists = 'replace', index = False)
                del df
                e = time.time()
                cprint.info(f'Finished {table}: {round(e - s, 3)}s')
                print('-------------------------------------\n')
            cprint.info('Raw WRDS files have been added to the local database.')

        if('CRSP_M' not in self.curr_tables):
            cprint.info(f'Creating combined data table CRSP_M...')
            # pull a single row from each source table just to learn its column names
            sf_df = pd.read_sql(f"""SELECT * FROM CRSP_MSF LIMIT 1""", con = self.sql_engine)
            names_df = pd.read_sql(f"""SELECT * FROM CRSP_MSENAMES LIMIT 1""", con = self.sql_engine)
            delist_df = pd.read_sql(f"""SELECT * FROM CRSP_MSEDELIST LIMIT 1""", con = self.sql_engine)
            # build the SELECT list; columns already taken from an earlier table are skipped
            vars_to_select = ''
            for var in list(sf_df.columns):
                vars_to_select += f'CRSP_MSF.{var}, '
            for var in list(set(list(names_df.columns)) - set(list(sf_df.columns))):
                vars_to_select += f'CRSP_MSENAMES.{var}, '
            for var in list(set(list(delist_df.columns)) - set(list(sf_df.columns)) - set(list(names_df.columns))):
                vars_to_select += f'CRSP_MSEDELIST.{var}, '
            vars_to_select = vars_to_select[:-2] # drop the trailing ', '
            sql_dic = {'vars': vars_to_select}
            sql_str = '''CREATE TABLE CRSP_M AS 
                         SELECT {0} FROM CRSP_MSF 
                         LEFT JOIN CRSP_MSENAMES ON CRSP_MSF.permno = CRSP_MSENAMES.permno AND CRSP_MSENAMES.namedt <= CRSP_MSF.date AND CRSP_MSF.date <= CRSP_MSENAMES.nameendt 
                         LEFT JOIN CRSP_MSEDELIST ON CRSP_MSF.permno = CRSP_MSEDELIST.permno AND CRSP_MSF.date = CRSP_MSEDELIST.date'''.format(sql_dic['vars'])
            with self.sql_engine.connect() as conn:
                _ = conn.execute(sql_str)

            cprint.info('Combined CRSP tables have been created.')

        # create merged CRSP and Compustat table
        if('CCM' not in self.curr_tables):
            table = 'CCM'
            cprint.info(f'Creating combined CRSP and Compustat table: {table}')

            # effectively "all history": query bounds wide enough to cover the whole sample
            start_date = datetime.date(1900, 6, 30)
            end_date = datetime.date(2100, 6, 30)

            # Compustat -------------------------------------------------------------------------------------------

            COMP_df = self.query_Compustat(start_date, end_date, 'A', sub_vars = ['ticker'], add_vars = ['years_in', 'fyear', 'revt', 'adjex_f'])
            COMP_df['year'] = COMP_df.datadate.dt.year

            # create preferred stock (redemption value, else liquidating value, else par value, else 0)
            COMP_df['ps'] = np.where(COMP_df.pstkrv.isnull(), COMP_df.pstkl, COMP_df.pstkrv)
            COMP_df.ps = np.where(COMP_df.ps.isnull(), COMP_df.pstk, COMP_df.ps)
            COMP_df.ps = np.where(COMP_df.ps.isnull(), 0, COMP_df.ps)
            COMP_df.txditc = COMP_df.txditc.fillna(0)

            # create book equity
            COMP_df['be'] = np.where(COMP_df.fyear < 1993, COMP_df.seq + COMP_df.txditc - COMP_df.ps, COMP_df.seq - COMP_df.ps) 

            # earnings
            COMP_df['earn'] = np.where(~COMP_df.ib.isnull(), COMP_df.ib, np.nan)

            # operating profitability
            COMP_df['xp_allnan'] = (COMP_df.cogs.isnull()) & (COMP_df.xsga.isnull()) & (COMP_df.xint.isnull())
            COMP_df['profit'] = COMP_df.revt - COMP_df.cogs.fillna(0) - COMP_df.xint.fillna(0) - COMP_df.xsga.fillna(0)
            COMP_df['op'] = np.where(COMP_df.be + COMP_df.mib != 0, COMP_df.profit / (COMP_df.be + COMP_df.mib.fillna(0)), np.nan)
            COMP_df.op = np.where(((~COMP_df.op.isnull()) & (~COMP_df.revt.isnull()) & (~COMP_df.xp_allnan)), COMP_df.op, np.nan)

            # NOTE: Compustat data yields gross outliers in 'op' w/ ratios as large as '1,000'.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'op' outside the 99th percentile are set to missing.
            COMP_df.op = np.where((COMP_df.op <= COMP_df.op.quantile(0.99)), COMP_df.op, np.nan)

            # investment (log asset growth; fall back to simple growth if logs raise on non-positive values)
            try:
                COMP_df['inv'] = np.log(COMP_df['at']) - np.log(COMP_df.groupby(by = ['gvkey'])['at'].shift(1))
            except FloatingPointError:
                COMP_df['inv'] = (COMP_df['at'] / COMP_df.groupby(by = ['gvkey'])['at'].shift(1)) - 1
            COMP_df.inv = np.where(~COMP_df.inv.isnull(), COMP_df.inv, np.nan)

            # NOTE: Compustat data yields gross outliers in 'inv' w/ percentages as low as '-100%' and as large as '10,000%'.
            #       These outliers are pervasive on the left tail of the distribution.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'inv' outside [15th, 99th] percentiles are winsorized.
            COMP_df.inv = np.where((COMP_df.inv.quantile(0.15) <= COMP_df.inv), COMP_df.inv, COMP_df.inv.quantile(0.15))
            COMP_df.inv = np.where((COMP_df.inv <= COMP_df.inv.quantile(0.99)), COMP_df.inv, COMP_df.inv.quantile(0.99))

            # cash flow
            COMP_df['cf'] = COMP_df.ib + COMP_df.txdi.fillna(0) + COMP_df.dpre.fillna(0)
            COMP_df.cf = np.where(~COMP_df.cf.isnull(), COMP_df.cf, np.nan)

            # accruals
            COMP_df['csho_adj'] = np.where((COMP_df.csho * COMP_df.adjex_f > 0), COMP_df.csho * COMP_df.adjex_f, np.nan)
            COMP_df['owcap_adj'] = ((COMP_df.act - COMP_df.che) - (COMP_df.lct.fillna(0) - COMP_df.dlc.fillna(0))) / COMP_df.csho_adj
            COMP_df['d_owcap_adj'] = COMP_df.owcap_adj - COMP_df.groupby(by = ['gvkey'])['owcap_adj'].shift(1)
            COMP_df['ac'] = np.where(~COMP_df.csho_adj.isnull(), COMP_df.d_owcap_adj / (COMP_df.be / COMP_df.csho_adj), np.nan)

            # NOTE: Compustat data yields gross outliers in 'ac' for June of each year {t} w/ ratios as low as '-200' and as large as '200'.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'ac' less than '-200' and values for 'ac' larger than '200' are set to missing.
            COMP_df.ac = np.where(-200 <= COMP_df.ac, COMP_df.ac, np.nan)
            COMP_df.ac = np.where(COMP_df.ac <= 200, COMP_df.ac, np.nan)

            # net shares issued
            COMP_df['ni_csho_adj'] = np.where(COMP_df.csho * COMP_df.adjex_f > 0, COMP_df.csho * COMP_df.adjex_f, np.nan)
            try:
                COMP_df['nsi'] = np.log(COMP_df.ni_csho_adj) - np.log(COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1))
            except FloatingPointError:
                COMP_df['nsi'] = (COMP_df.ni_csho_adj / COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1)) - 1
            COMP_df.nsi = np.where(~COMP_df.nsi.isnull(), COMP_df.nsi, np.nan)

            # NOTE: Compustat data yields outliers in 'ni' w/ ratios as large as '20'.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'ni' outside the 99.9th percentile are set to missing.
            COMP_df.nsi = np.where(COMP_df.nsi <= COMP_df.nsi.quantile(0.999), COMP_df.nsi, np.nan)
            COMP_df = COMP_df.drop(columns = ['owcap_adj', 'xp_allnan'])

            # CRSP ------------------------------------------------------------------------------------------------

            CRSP_df = self.query_CRSP(start_date, end_date, 'M')
            CRSP_df['jdate'] = CRSP_df.date

            # create timing columns
            CRSP_df['year'] = CRSP_df['jdate'].dt.year
            CRSP_df['month'] = CRSP_df['jdate'].dt.month

            # turnover (turn)
            # The turnover (TURN) for each stock is defined the monthly traded volume scaled by the total number of shares outstanding.
            CRSP_df['turn'] = CRSP_df.vol / CRSP_df.shrout

            # traded volume in dollars (dvol)
            # The traded volume in dollars (DVOL) is defined as the number of shares traded in a given month multiplied by the closing stock price.
            CRSP_df['dvol'] = CRSP_df.vol * CRSP_df.prc.abs()

            # calculate prior returns
            # Prior 1-1 is the cummulative return in [t - 1]
            # Prior 2-12 is the cummulative return from [t - 12] to [t - 2]
            # Prior 13-60 is the cummulative return from [t - 60] to [t - 13]
            for ret_typ in ['adjret', 'adjretx']:
                for typ in [(1, 1), (2, 12), (13, 60)]:
                    name = f'pr{typ[0]}_{typ[1]}' if(ret_typ == 'adjret') else f'prx{typ[0]}_{typ[1]}'
                    CRSP_df[name] = 1
                    for i in range(typ[0], typ[1] + 1):
                        # temporary '_L{i}' columns hold the lagged gross returns being compounded
                        CRSP_df[f'{ret_typ}_L{i}'] = 1 + CRSP_df.groupby(by = ['permno'])[ret_typ].shift(i)
                        CRSP_df[name] *= CRSP_df[f'{ret_typ}_L{i}']
                    CRSP_df = CRSP_df.drop(CRSP_df.filter(regex = '_L').columns, axis = 1)
                    CRSP_df[name] -= 1

            ### Aggregate Market Cap ###
            # sum of me across different permno belonging to same permco a given date
            crsp_summe = CRSP_df.groupby(['jdate','permco'])['me'].sum().reset_index()

            # largest mktcap within a permco/date
            crsp_maxme = CRSP_df.groupby(['jdate','permco'])['me'].max().reset_index()

            # join by jdate/maxme to find the permno
            CRSP_df = CRSP_df.merge(crsp_maxme, how='inner', on=['jdate','permco','me'])

            # drop me column and replace with the sum me
            CRSP_df = CRSP_df.drop(columns = ['me'])

            # join with sum of me to get the correct market cap info
            CRSP_df = CRSP_df.merge(crsp_summe, how='inner', on=['jdate','permco'])

            ### July to June dates
            CRSP_df['ffdate'] = CRSP_df['jdate'] + MonthEnd(-6)
            CRSP_df['ffyear'] = CRSP_df['ffdate'].dt.year
            CRSP_df['ffmonth'] = CRSP_df['ffdate'].dt.month
            CRSP_df['1+adjretx'] = 1 + CRSP_df.adjretx

            # cumret by stock and fama-french year
            CRSP_df['ffyear_cumretx'] = CRSP_df.groupby(by = ['permno', 'ffyear'])['1+adjretx'].cumprod()

            # lag of ffyear_cumretx
            CRSP_df['L_ffyear_cumretx'] = CRSP_df.groupby(by = ['permno'])['ffyear_cumretx'].shift(1)

            # lag market cap
            CRSP_df['L_me']=CRSP_df.groupby(by = ['permno'])['me'].shift(1)

            # if first permno then use me/(1+retx) to replace the missing value
            CRSP_df['months_in'] = CRSP_df.groupby(by = ['permno']).cumcount()
            CRSP_df.L_me = np.where(CRSP_df.months_in == 0, CRSP_df.me / CRSP_df['1+adjretx'], CRSP_df.L_me)

            # baseline me
            mebase = CRSP_df[CRSP_df.ffmonth == 1][['permno','ffyear', 'L_me']].rename(columns={'L_me': 'mebase'})

            # merge result back together
            CRSP_df = CRSP_df.merge(mebase, how = 'left', on = ['permno', 'ffyear'])
            CRSP_df['wt'] = np.where(CRSP_df.ffmonth == 1, CRSP_df.L_me, CRSP_df.mebase * CRSP_df.L_ffyear_cumretx)

            # Info as of June & December and merge together for characteristics
            CRSP_JUN_df = CRSP_df[CRSP_df.month == 6]
            CRSP_DEC_df = CRSP_df[CRSP_df.month == 12]
            CRSP_DEC_df = CRSP_DEC_df[['permno','date','jdate','me','year']].rename(columns={'me': 'dec_me'})
            # December me of year {t-1} is matched with June of year {t}
            CRSP_DEC_df.year += 1
            CRSP_DEC_df = CRSP_DEC_df[['permno','year','dec_me']]
            CRSP_JUN_df = CRSP_JUN_df.merge(CRSP_DEC_df, how = 'inner', on = ['permno', 'year'])
            CRSP_JUN_df = CRSP_JUN_df[['permno', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'mebase', 'L_me', 'dec_me']]
            CRSP_JUN_df = CRSP_JUN_df.drop_duplicates()

            # query the link table
            link_table_df = self.query_link_table()
            CCM_df = COMP_df.merge(link_table_df, how='left', on=['gvkey'])
            CCM_df['jdate'] = CCM_df.year_end + MonthEnd(6)

            # set link date bounds
            CCM_df = CCM_df[(CCM_df.jdate >= CCM_df.linkdt) & (CCM_df.jdate <= CCM_df.linkenddt)]

            # link COMP_df and crsp
            CCM_JUN_df = CRSP_JUN_df.merge(CCM_df, how = 'inner', on = ['permno', 'jdate'])
            CCM_JUN_df['ffyear'] = CCM_JUN_df.jdate.dt.year

            # drop columns before merging
            CCM_JUN_df = CCM_JUN_df.drop(columns = ['mebase', 'L_me', 'linktype', 'linkprim', 'linkenddt', 'linkdt', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'year', 'permco'])
            CRSP_df = CRSP_df.drop(columns = ['L_me', 'L_ffyear_cumretx', 'ffyear_cumretx', '1+adjretx', 'mebase', 'jdate'])

            # merge to monthly
            CCM_df = CRSP_df.merge(CCM_JUN_df, how = 'left', on = ['permno', 'ffyear'])
            CCM_df = CCM_df.dropna(subset = ['datadate'])
            CCM_df = CCM_df.sort_values(by = ['permno', 'date'])

            # create characteristics

            # book to market
            CCM_df['ffbm'] = np.where(CCM_df.dec_me != 0, CCM_df.be / CCM_df.dec_me, np.nan)
            CCM_df['bm'] = np.where(CCM_df.me != 0, CCM_df.be / CCM_df.me, np.nan)

            # earnings to price
            CCM_df['ffep'] = np.where(CCM_df.dec_me != 0, CCM_df.earn / CCM_df.dec_me, np.nan)
            CCM_df['ep'] = np.where(CCM_df.me != 0, CCM_df.earn / CCM_df.me, np.nan)

            # cashflow to price
            CCM_df['ffcfp'] = np.where(CCM_df.dec_me != 0, CCM_df.cf / CCM_df.dec_me, np.nan)
            CCM_df['cfp'] = np.where(CCM_df.me != 0, CCM_df.cf / CCM_df.me, np.nan)

            # market beta (rolling 60-month cov/var against the value-weighted market return)
            mkt_s = CCM_df.groupby('date').apply(wavg, 'adjret', 'me')
            mkt_s.name = 'mkt'
            mkt_s = mkt_s.to_frame().reset_index()
            CCM_df = CCM_df.merge(mkt_s, how = 'left', on = ['date'])

            CCM_df['cov'] = CCM_df.groupby(['permno'])[['adjret', 'mkt']].rolling(min_periods = 24, window = 60).cov().unstack()['adjret']['mkt'].reset_index(level = 'permno').rename(columns = {'mkt': 'cov'})[['cov']]
            CCM_df['var'] = CCM_df.groupby(['permno'])['mkt'].rolling(min_periods = 24, window = 60).var().reset_index(level = 'permno').rename(columns = {'mkt': 'var'})[['var']]
            CCM_df['beta'] = CCM_df['cov'] / CCM_df['var']

            ## additional characteristics
            # sale-to-price
            CCM_df['sp'] = np.where(CCM_df.me != 0, CCM_df.sale / CCM_df.me, np.nan)


            #### ISSUES WITH GROWTH VARS

            # earnings growth
            CCM_df['eg'] = CCM_df.earn.pct_change()

            # growth in sales
            CCM_df['grs'] = CCM_df.sale.pct_change()

            # growth in book value
            CCM_df['grb'] = CCM_df.be.pct_change()

            # growth in cashflow
            CCM_df['grcf'] = CCM_df.cf.pct_change()

            def _percentile_rank(df, var):
                # assign each observation of 'var' its percentile rank (1-100) in the full sample
                ptiles = list(df[var].quantile(q = list(np.arange(start = 0, step = 0.01, stop = 1))))
                df[f'{var}_pr'] = 100
                for i in range(99, 0, -1):
                    mask = df[var] < ptiles[i]
                    df.loc[mask, f'{var}_pr'] = i
                return(df)
            
            for var in ['bm', 'sp', 'cfp', 'dp', 'eg', 'grs', 'grcf', 'grb']:
                CCM_df = _percentile_rank(CCM_df, var)
            
            s = time.time()

            # Pastor-Stambaugh Liquidity Index (psliq)
            # The Pastor-Stambaugh Liquidity Index (PSLIQ) is defined for each stock as follow. We run one regression for each calendar month based on each stock's daily return, using the current daily
            # return in the left-hand side. The right-hand side variables are the lagged daily return as well as the lagged daily return interacted with the lagged traded dollar volume. The coefficient of the
            # interaction term is the measure of liquidity - for each stock and each month.
            # TODO: psliq is not implemented yet.

            e = time.time()
            print(CCM_df.head(50))
            print(f'time = {e - s}')


            # save merged dataframe to SQL database
            CCM_df = CCM_df.drop_duplicates()
            CCM_df.to_sql(table, con = self.sql_engine, if_exists = 'replace', index = False)
        
            cprint.ok('CRSP and Compustat merged table has been created.')
 
    def __str__(self) -> str:
        #TODO: print tables and columns from loacl database
        return(f"WRDS Username: {self.username}")

    def raw_sql(self, sql_str):
        """
        Run an arbitrary SQL statement against the underlying local database, after an
        interactive confirmation prompt.

        Returns the result as a DataFrame, or None when the user declines.

        Note
        _____
        This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database.
        """
        # require explicit confirmation before touching the database
        cprint.warn('The operation that you are about to perform might damage the local database. Do you wish to continue [y/n]:')
        response = input()
        # guard clause: anything other than an exact 'y' cancels the operation
        if(response != 'y'):
            cprint.info('Operation cancelled.')
            return(None)
        return(pd.read_sql(sql_str, con = self.sql_engine))

    def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pd.DataFrame:
        """
        Used to query the merged CRSP/Compustat (CCM) table.

        Parameters
        ___________
        start_date: The starting date of the data query.
        end_date:   The ending date of the data query.
        freq:       The frequency of the data query.
            Choices are:
                * Q: quarterly
                * A: annual

        Keyword Arguments
        __________________
        vars:     list; The variables to query for.
        add_vars: list; Additional variables to query for ontop of the default variables.
        sub_vars: list; Variables to remove from the default variables.
        all_vars: bool; Set to true to query for all variables in the table.
        id_type:  str;  Type of ID used to query for specific assets.
            Choices are:
                * ticker
                * gvkey
                * permno
                * cusip
                * permco
        ids:      list; The ids of type 'id_type' to query for.

        Note
        _____
        The variables that can be queiried for are:
            'gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 
            'pstk', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'shrcd', 'exchcd', 
            'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev', 'pr2_12', 'pr1_1', 'pr13_60', 'prx2_12', 'prx1_1', 'prx13_60'

        Note
        _____
        If the frequency is quarterly the variables that can be queiried for are the same as the annual file except
            for 'pstkrv' and 'pstkl'.

        Note
        _____
        The defualt variables that are queried for are:
            'date', 'gvkey', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe',  
            'cfp', 'inv', 'op', 'PR2_12', 'PR1_1', 'PR13_60'
        """
        # vars that can be queiried for
        VALID_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'dp', 'year', 'month', 'pr1_1', 'pr2_12', 'pr13_60', 'prx1_1', 'prx2_12', 'prx13_60', 
                      'me', 'ffdate', 'ffyear', 'ffmonth', 'months_in', 'wt', 'dec_me', 'dltt', 'mib', 'revt', 'csho', 'adjex_f', 'act', 'xint', 'pstk', 'txdi', 'gvkey', 'ib', 'xsga', 'dlc', 'ceq', 'che', 'datadate', 'txdc', 'dpc', 'ibc', 
                      'fyear', 'pstkl', 'teq', 'cogs', 'pstkrv', 'lct', 'dpre', 'txditc', 'seq', 'at', 'sale', 'year_end', 'years_in', 'ps', 'be', 'earn', 'profit', 'op', 'inv', 'cf', 'csho_adj', 'd_owcap_adj', 'ac', 'ni_csho_adj', 'nsi', 'ffbm', 
                      'bm', 'ffep', 'ep', 'ffcfp', 'cfp', 'beta']

        # for annual
        #VALID_VARS = ['gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 
        #                  'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 
        #                  'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev']

        # if no keyword arguments are given then these are the defaults returned
        DEFAULT_VARS = ['date', 'gvkey', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'datadate', 'year_end', 'ffdate', 'prc', 'shrout', 'adjret', 'adjretx', 
                        'me', 'wt', 'dp', 'be', 'bm', 'ffbm', 'ep', 'ffep', 'cfp', 'ffcfp', 'inv', 'op', 'pr2_12', 'pr1_1', 'pr13_60', 'beta', 'ac', 'nsi', 'years_in', 'months_in', 'month', 'ffyear']

        VARS_DATA_TYPE = {'permno': str, 'permco': str, 'ticker': str, 'shrcd': str, 'exchcd': str, 'prc': float, 'shrout': float, 'adjret': float, 'adjretx': float, 'adjcumret': float, 
                          'adjcumretx': float, 'dp': float, 'year': int, 'month': int, 'pr1_1': float, 'pr2_12': float, 'pr13_60': float, 'prx1_1': float, 'prx2_12': float, 'prx13_60': float, 
                          'me': float, 'ffyear': int, 'ffmonth': int, 'months_in': int, 'wt': float, 'dec_me': float, 'dltt': float, 'mib': float, 'revt': float, 'csho': float, 'adjex_f': float, 
                          'act': float, 'xint': float, 'pstk': float, 'txdi': float, 'gvkey': str, 'ib': float, 'xsga': float, 'dlc': float, 'ceq': float, 'che': float, 'txdc': float, 'dpc': float, 'ibc': float, 
                          'fyear': int, 'pstkl': float, 'teq': float, 'cogs': float, 'pstkrv': float, 'lct': float, 'dpre': float, 'txditc': float, 'seq': float, 'at': float, 'sale': float, 'years_in': int, 
                          'ps': float, 'be': float, 'earn': float, 'profit': float, 'op': float, 'inv': float, 'cf': float, 'csho_adj': float, 'd_owcap_adj': float, 'ac': float, 'ni_csho_adj': float, 'nsi': float, 'ffbm': float, 
                          'bm': float, 'ffep': float, 'ep': float, 'ffcfp': float, 'cfp': float, 'beta': float}

        #############################################################################################################################################
        # Query Validation and Error Checking
        #############################################################################################################################################

        # keywrods 'additional_vars' and 'vars' cannot be used simultaneously
        if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

        # create list of the variables being quireied 
        query_vars = None
        if('vars' in kwargs):
            # variable arguments to query for
            query_vars = kwargs['vars']

            # 'permno' needs to be in the query vars for merging
            if('permno' not in query_vars): query_vars.insert(0, 'permno')

            # add date if people forgot
            if('date' not in query_vars): query_vars.insert(0, 'date')
        else:
            if('add_vars' in kwargs):
                query_vars = DEFAULT_VARS + kwargs['add_vars']
            else:
                query_vars = DEFAULT_VARS
            if('sub_vars' in kwargs):
                query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
            
        if('all_vars' in kwargs): query_vars = VALID_VARS

        # make sure that all vars are valid to be quieired (if your every actually reading these comments u know that i cant spell queried without googleing it)
        all_valid = all(elem in VALID_VARS for elem in query_vars)
        if(not all_valid):
            incorrect_vars = list(set(query_vars) - set(VALID_VARS))
            raise Exception(f'Variables {incorrect_vars} cannot be queried from the combined CRSP/Compustat merged table. The CCM table does not contain all of the variables that are in CRSP and Compustat.')
        
        specific_query = False
        id_type = ''
        ids = []
        if('id_type' in kwargs or 'ids' in kwargs):
            if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
            if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
            specific_query = True
            id_type = kwargs['id_type']
            ids = kwargs['ids']

        ##############################################################################################################################################
        # Load the raw data
        ##############################################################################################################################################

        # read in raw dataframe from local sql database
        raw_df = pd.read_sql(self._CCM_sql_query(start_date, end_date, vars = query_vars, 
                                                 specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)

        ##############################################################################################################################################
        # Clean the raw data
        ##############################################################################################################################################

        # I HATE U SEC 
        if(query_vars is None): 
            raw_df.fyear = raw_df.fyear.astype(float)
        if(not query_vars is None):
            if('fyear' in query_vars): 
                raw_df.fyear = raw_df.fyear.astype(float)

        # get vars in the dataframe
        quried_vars = list(set(list(raw_df.columns)) - set(['date', 'datadate', 'ffdate', 'year_end']))
        vars_dtypes = {}
        for var in quried_vars:
            vars_dtypes[var] = VARS_DATA_TYPE[var]

        # convert to correct data types
        raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
        raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
        raw_df.ffdate = pd.to_datetime(raw_df.ffdate, format = '%Y-%m-%d')
        raw_df.year_end = pd.to_datetime(raw_df.year_end, format = '%Y-%m-%d')
        raw_df = raw_df.astype(vars_dtypes)

        # replace and python objects 'None' to np.nan
        raw_df = raw_df.fillna(value = np.nan)

        # reset to original variables, drop duplicates, and reset the index
        raw_df = raw_df[query_vars]
        raw_df = raw_df.drop_duplicates()
        raw_df = raw_df.sort_values(by = ['permno', 'date'])
        raw_df = raw_df.reset_index(drop = True)

        # return dataframe
        return(raw_df)

    def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) -> pd.DataFrame:
        """
        Used to query the raw Compustat tables.

        Parameters
        ___________
        start_date: The starting date of the data query.
        end_date:   The ending date of the data query.
        freq:       The frequency of the data query.
            Choices are:
                * Q: quarterly
                * A: annual

        Keyword Arguments
        __________________
        vars:     list; The variables to query for.
        add_vars: list; Additional variables to query for ontop of the default variables.
        sub_vars: list; Variables to remove from the default variables.
        all_vars: bool; Set to true to query for all variables in the table.
        id_type:  str;  Type of ID used to query for specific assets.
            Choices are:
                * ticker
                * gvkey
                * permno
                * cusip
                * permco
        ids:      list; The ids of type 'id_type' to query for.

        Returns
        ________
        raw_df: pd.DataFrame; The cleaned Compustat data sorted by ('gvkey', fiscal period end).

        Note
        _____
        The default variables that are queried for from the quarterly file have their names changed to mirror those in the annual
            file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the 
            fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to 
            the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make 
            it easier to compute the anomaly characteristics when creating the combined CCM tables.  

        Note
        _____
        By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks.
            To do this you need to know the actual name of the variable that you want to query for, paying attention to Compustat's
            naming conventions between their annual and quarterly files.

        Note
        _____
        The default variables that are queried for if the frequency given is annual:
            'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq', 
            'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib'

            If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'.

        Note
        _____
        There is less error checking in this function compared to the other methods in this class because of the large number of variables
            in Compustat.
        """
        # default variables pulled from the annual vs. quarterly file (quarterly names carry a trailing 'q'/'y')
        STD_VARS = None
        if(freq == 'A'):
            STD_VARS = ['gvkey', 'datadate', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdc', 'dpc', 'che', 'dlc', 'ceq', 'seq', 'teq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'ibc', 'dltt', 'mib', 'ib', 'dp']
        else:
            STD_VARS = ['gvkey', 'datadate', 'tic', 'atq', 'saleq', 'cogsq', 'actq', 'txdiq', 'cshoq', 'lctq', 'txdcy', 'dpcy', 'cheq', 'dlcq', 'ceqq', 'seqq', 'teqq', 'pstkq', 'txditcq', 'xintq', 'xsgaq', 'ibcy', 'dlttq', 'mibq', 'ibq', 'dpq']

        # dtypes applied after renaming the quarterly columns to their annual-style names
        DEFAULT_DTYPES = {'gvkey': str, 'ticker': str, 'at': float, 'sale': float, 'cogs': float, 'act': float, 'txdi': float, 'csho': float, 'lct': float, 'dltt': float, 'mib': float,
                          'txdc': float, 'dpre': float, 'che': float, 'dlc': float, 'ceq': float, 'seq': float, 'teq': float, 'pstk': float, 'txditc': float, 'xint': float, 'xsga': float, 'ibc': float, 'ib': float}

        # variables created in this function rather than stored in the database
        CREATED_VARS = ['years_in']

        #############################################################################################################################################
        # Query Validation and Error Checking
        #############################################################################################################################################

        if(freq not in ['Q', 'A']): raise Exception('Invalid frequency given to query_Compustat')

        # keywords 'add_vars' and 'vars' cannot be used simultaneously
        if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keyword Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

        # create the list of the variables being queried
        query_vars = None
        if('vars' in kwargs):
            # copy so the inserts below do not mutate the caller's list
            query_vars = list(kwargs['vars'])

            # 'gvkey' needs to be in the query vars for merging
            if('gvkey' not in query_vars): query_vars.insert(0, 'gvkey')

            # add the date if the caller forgot it
            if('datadate' not in query_vars and 'date' not in query_vars): query_vars.insert(0, 'datadate')
        else:
            if('add_vars' in kwargs):
                query_vars = STD_VARS + kwargs['add_vars']
            else:
                query_vars = STD_VARS
            if('sub_vars' in kwargs):
                sub_vars = ['tic' if elem == 'ticker' else elem for elem in kwargs['sub_vars']]
                query_vars = [elem for elem in query_vars if elem not in sub_vars]

        # translate CRSP-style names to their Compustat equivalents
        query_vars = ['datadate' if elem == 'date' else elem for elem in query_vars]
        query_vars = ['tic' if elem == 'ticker' else elem for elem in query_vars]
        query_vars = ['conm' if elem == 'comnam' else elem for elem in query_vars]

        # a query_vars of None signals the SQL builder to select every column
        if('all_vars' in kwargs): query_vars = None

        indfmts = kwargs['indfmts'] if('indfmts' in kwargs) else ['INDL']           # default: Industrial format
        datafmts = kwargs['datafmts'] if('datafmts' in kwargs) else ['STD']         # default: Standardized
        popsrcs = kwargs['popsrcs'] if('popsrcs' in kwargs) else ['D']              # default: Domestic population source
        consols = kwargs['consols'] if('consols' in kwargs) else ['C']              # default: Consolidated

        specific_query = False
        id_type = ''
        ids = []
        if('id_type' in kwargs or 'ids' in kwargs):
            if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
            if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyword argument given an empty list.')
            specific_query = True
            id_type = kwargs['id_type']
            # Compustat stores the ticker under 'tic'
            if(id_type == 'ticker'): id_type = 'tic'
            ids = kwargs['ids']

        # created vars are not stored in the database; when 'all_vars' was given pass None straight through
        # (the original code crashed here on set(None) when 'all_vars' was used)
        query_vars_DB = None if(query_vars is None) else list(set(query_vars) - set(CREATED_VARS))

        ##############################################################################################################################################
        # Load the raw data
        ##############################################################################################################################################

        # read in raw dataframe from local sql database
        raw_df = pd.read_sql(self._compustat_SQL_query(start_date, end_date, freq, vars = query_vars_DB, 
                                                       indfmt = indfmts, datafmt = datafmts, popsrc = popsrcs, consol = consols,
                                                       specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)


        ##############################################################################################################################################
        # Clean the raw data
        ##############################################################################################################################################

        # rename identifier columns to their CRSP-style names
        raw_df = raw_df.rename(columns = {'tic': 'ticker', 'conm':'comnam'})

        # rename the default columns to match the names from the COMPA_FUNDA (annual) file
        if(freq == 'Q'):
            # quarterly compustat uses different names for the same items (blame Compustat, not me)
            raw_df = raw_df.rename(columns = {'atq':'at', 'seqq':'seq', 'ceqq':'ceq', 'teqq':'teq',
                                              'pstkq':'pstk', 'txdcy':'txdc', 'txditcq':'txditc', 'saleq':'sale',
                                              'cogsq':'cogs', 'xintq':'xint', 'xsgaq':'xsga', 'mibq':'mib', 
                                              'ibcy':'ibc', 'txdiq':'txdi', 'dpq':'dpre', 'cshoq':'csho', 'adjex':'adjex_f',
                                              'actq':'act', 'lctq':'lct', 'cheq':'che', 'dlcq':'dlc', 'dlttq': 'dltt', 'ibq': 'ib'})
        else:
            # annual compustat: rename for consistency with the created-variable naming
            raw_df = raw_df.rename(columns = {'dp': 'dpre'})

        # make datadate a datetime and align it to the end of the fiscal year/quarter
        raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
        if(freq == 'A'):
            raw_df['year_end'] = raw_df.datadate + YearEnd(0)
        else:
            raw_df['quarter_end'] = raw_df.datadate + QuarterEnd(0)

        # fyear can arrive as a nullable/object column; force it to float.
        # checked on the dataframe itself so the 'all_vars' path (query_vars is None) does not crash
        if('fyear' in raw_df.columns): 
            raw_df.fyear = raw_df.fyear.astype(float)

        # number of prior observations for each company (created here, not stored in Compustat)
        if(query_vars is None or 'years_in' in query_vars):
            raw_df['years_in'] = raw_df.groupby(by = ['gvkey']).cumcount()
                
        # collect the dtypes of the variables actually in the dataframe
        quried_vars = list(set(list(raw_df.columns)) - set(['date']))
        vars_dtypes = {}
        for var in quried_vars:
            if(var in DEFAULT_DTYPES):
                vars_dtypes[var] = DEFAULT_DTYPES[var]

        # convert dtypes (fillna first so python None becomes np.nan before the cast)
        raw_df = raw_df.fillna(value = np.nan)
        raw_df = raw_df.astype(vars_dtypes)

        # drop duplicates and sort just for ease of reading
        raw_df = raw_df.drop_duplicates()
        sorting_dims = ['gvkey', 'year_end'] if(freq == 'A') else ['gvkey', 'quarter_end']
        raw_df = raw_df.sort_values(by = sorting_dims)
        raw_df = raw_df.reset_index(drop = True)

        # return the dataframe
        return(raw_df)

    def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) -> pd.DataFrame:
        """
        Used to query the raw CRSP files. Additionally, variables can be created and stock splits can be adjusted for.

        Parameters
        ___________
        start_date: The starting date of the data query.
        end_date:   The ending date of the data query.
        freq:       The frequency of the data query.
            Choices are:
                * M: monthly
                * D: daily
        adj_stocksplit: default = True; Whether or not to adjust for a stock split event.

        Keyword Arguments
        __________________
        vars:     list; The variables to query for.
        add_vars: list; Additional variables to query for ontop of the default variables.
        sub_vars: list; Variables to remove from the default variables.
        all_vars: bool; Set to true to query for all variables in the table.
        id_type:  str;  Type of ID used to query for specific assets.
            Choices are:
                    * ticker
                    * gvkey
                    * permno
                    * cusip
                    * permco
        ids:      list; The ids of type 'id_type' to query for.
        exchcds:  list; The exchange codes to use for querying (default: NYSE, NYSE Market (formally AMEX), and NASDAQ)
        shrcds:   list; The share codes to use for querying (default: US Common Stock) 

        Returns
        ________
        raw_df: pd.DataFrame; The cleaned CRSP data sorted by ('permno', 'date').

        Note
        _____
        If the frequency is monthly then the variables that can be queried for are:
            'date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 
            'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 
            'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 
            'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt'

        Note
        _____
        If the frequency is daily then the variables that can be queried for are:
            'date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
            'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 
            'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 
            'nwperm', 'dlamt', 'openprc', 'numtrd'
            
        Note
        _____
        The variables that are created from CRSP primitives are:
            * 'me':         Market Equity (millions)
            * 'adjret':     Returns adjusted for delisting events
            * 'adjretx':    Returns adjusted for delisting events ex. dividend
            * 'dvd':        Dividend (uses 'adjret' and 'adjretx' to calculate)
            * 'dp':         Dividend-to-Price Ratio 
            * 'cumret':     Cumulative gross return
            * 'cumretx':    Cumulative gross return ex. dividend
            * 'adjcumret':  Cumulative gross return adjusted for delisting events
            * 'adjcumretx': Cumulative gross return adjusted for delisting events ex. dividend

        Note
        _____
        This function defaults to querying for all companies that are consistent with Famas definitions. That is to say assets with a share code of 10 or 11 
            and an exchange code of 1, 2, or 3.
        """
        # variables that can be queried for
        STD_VARS = None
        if(freq == 'M'):
            STD_VARS = ['date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
                        'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 
                        'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 
                        'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt']
        else:
            STD_VARS = ['date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
                        'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 
                        'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 
                        'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd']

        # variables created in this function rather than stored in the database
        CREATE_VARS = ['me', 'adjret', 'adjretx', 'dvd', 'dp', 'cumret', 'cumretx', 'adjcumret', 'adjcumretx']

        VALID_VARS = STD_VARS + CREATE_VARS

        # if no keyword arguments are given then these are the defaults returned
        DEFAULT_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'me', 'dp', 'vol']

        # variable data types
        VARS_DATA_TYPE = {'cusip': str, 'permno': str, 'permco' : str, 'comnam': str, 'compno': str, 'ticker': str, 
                          'primexch': str, 'tsymbol': str, 'secstat': str, 'hsiccd': str, 'naics': str, 'siccd': str, 'trdstat': str, 'ncusip': str,
                          'shrcd': str, 'exchcd': str, 'issuno': str, 'hexcd': str, 'shrcls': str, 
                          'ret': float, 'retx': float, 'shrout': float, 'prc': float, 'cfacshr': float, 'cfacpr': float, 
                          'bidlo': float, 'bid': float, 'ask': float, 'askhi': float, 'spread': float, 'altprc': float, 'vol': float,
                          'dlstdt': str, 'dlstcd': str, 'nwperm': str, 'nwcomp': str, 'nextdt': str, 'dlamt': float, 'dlretx': float, 'dlprc': float, 
                          'dlpdt': str, 'dlret': float, 'acperm': str, 'accomp': str, 'me': float, 'adjret': float, 'adjretx': float, 'dvd': float, 
                          'adjdvd': float, 'dp': float, 'openprc': float, 'numtrd': float, 'cumret': float, 'cumretx': float, 'adjcumret': float, 'adjcumretx': float}

        #############################################################################################################################################
        # Query Validation and Error Checking
        #############################################################################################################################################

        if(freq not in ['D', 'M']): raise Exception('Invalid frequency given to query_CRSP')

        # keywords 'add_vars' and 'vars' cannot be used simultaneously
        if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keyword Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

        # create the list of the variables being queried
        query_vars = None
        if('vars' in kwargs):
            # copy so the inserts below do not mutate the caller's list
            query_vars = list(kwargs['vars'])

            # 'permno' needs to be in the query vars for merging
            if('permno' not in query_vars): query_vars.insert(0, 'permno')

            # add the date if the caller forgot it
            if('date' not in query_vars): query_vars.insert(0, 'date')
        else:
            if('add_vars' in kwargs):
                query_vars = DEFAULT_VARS + kwargs['add_vars']
            else:
                query_vars = DEFAULT_VARS
            if('sub_vars' in kwargs):
                query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
            

        if('all_vars' in kwargs): query_vars = VALID_VARS

        # used for dataframe formatting at the end (dependency variables appended below are dropped again)
        og_vars = query_vars.copy()

        # make sure that all vars are valid to be queried
        all_valid = all(elem in VALID_VARS for elem in query_vars)
        if(not all_valid):
            incorrect_vars = list(set(query_vars) - set(VALID_VARS))
            raise Exception(f'Variables {incorrect_vars} cannot be queried from CRSP.')

        # always adjust for stock splits (can disable this)
        if(adj_stocksplit):
            if('prc' not in query_vars): query_vars.append('prc')
            if('cfacpr' not in query_vars): query_vars.append('cfacpr')
            if('shrout' not in query_vars): query_vars.append('shrout')
            if('cfacshr' not in query_vars): query_vars.append('cfacshr')

        # dependency resolution for created variables: each block appends the primitives a
        # created variable needs. Blocks are ordered so that anything appended here is itself
        # resolved by a later block (e.g. 'adjcumret' -> 'adjret' -> 'ret' + 'dlret').
        if('me' in query_vars):
            if('prc' not in query_vars): query_vars.append('prc')
            if('shrout' not in query_vars): query_vars.append('shrout')

        if('dp' in query_vars):
            if('dvd' not in query_vars): query_vars.append('dvd')

        if('dvd' in query_vars):
            if('adjret' not in query_vars): query_vars.append('adjret')
            if('adjretx' not in query_vars): query_vars.append('adjretx')
            if('prc' not in query_vars): query_vars.append('prc')
            # the dividend calculation groups on permco, so it must be queried
            if('permco' not in query_vars): query_vars.append('permco')

        if('adjcumret' in query_vars):
            if('adjret' not in query_vars): query_vars.append('adjret')

        if('adjcumretx' in query_vars):
            if('adjretx' not in query_vars): query_vars.append('adjretx')

        if('cumret' in query_vars):
            if('ret' not in query_vars): query_vars.append('ret')

        if('cumretx' in query_vars):
            if('retx' not in query_vars): query_vars.append('retx')

        if('adjret' in query_vars):
            if('ret' not in query_vars): query_vars.append('ret')
            if('dlret' not in query_vars): query_vars.append('dlret')

        if('adjretx' in query_vars):
            if('retx' not in query_vars): query_vars.append('retx')
            if('dlretx' not in query_vars): query_vars.append('dlretx')


        exchcds = kwargs['exchcds'] if('exchcds' in kwargs) else [1, 2, 3] # default: NYSE, NYSE MKT, NASDAQ
        shrcds = kwargs['shrcds'] if('shrcds' in kwargs) else [10, 11]     # default: US-based common stock

        specific_query = False
        id_type = ''
        ids = []
        if('id_type' in kwargs or 'ids' in kwargs):
            if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
            if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyword argument given an empty list.')
            specific_query = True
            id_type = kwargs['id_type']
            ids = kwargs['ids']

        # created vars are not in the table so remove them
        db_vars = [var for var in query_vars if var not in CREATE_VARS]

        ##############################################################################################################################################
        # Load the raw data
        ##############################################################################################################################################

        # read in raw dataframe from local sql database
        raw_df = pd.read_sql(self._CRSP_SQL_query(start_date, end_date, freq, vars = db_vars, exchcds = exchcds, shrcds = shrcds, 
                                                  specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)

        ##############################################################################################################################################
        # Clean the raw data
        ##############################################################################################################################################

        # integer-coded identifiers can come back as floats with NaNs; use pandas nullable Int64 first
        DOWNCAST_VARS = ['permno', 'permco', 'exchcd', 'issuno', 'hexcd', 'shrcd', 'compno', 'hsiccd', 'naics', 'siccd', 'acperm', 'accomp', 'dlstcd', 'nwcomp', 'nwperm']
        for var in DOWNCAST_VARS:
            if(var in query_vars):
                raw_df[var] = raw_df[var].astype('Int64')

        # collect the dtypes of the (non-date) variables actually in the dataframe
        quried_vars = list(set(list(raw_df.columns)) - set(['altprcdt', 'date', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']))
        vars_dtypes = {}
        for var in quried_vars:
            vars_dtypes[var] = VARS_DATA_TYPE[var]

        # convert dates to datetimes and align to end of month 
        raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
        if(freq == 'M'): raw_df.date += MonthEnd(0)

        if('altprcdt' in query_vars): raw_df.altprcdt = pd.to_datetime(raw_df.altprcdt, format = '%Y-%m-%d')
        if('nameendt' in query_vars): raw_df.nameendt = pd.to_datetime(raw_df.nameendt, format = '%Y-%m-%d')
        if('namedt' in query_vars): raw_df.namedt = pd.to_datetime(raw_df.namedt, format = '%Y-%m-%d')
        if('dlstdt' in query_vars): raw_df.dlstdt = pd.to_datetime(raw_df.dlstdt, format = '%Y-%m-%d')
        if('nextdt' in query_vars): raw_df.nextdt = pd.to_datetime(raw_df.nextdt, format = '%Y-%m-%d')
        if('dlpdt' in query_vars): raw_df.dlpdt = pd.to_datetime(raw_df.dlpdt, format = '%Y-%m-%d')

        # make sure that the data is the correct type
        raw_df = raw_df.astype(vars_dtypes)

        # replace any python 'None' objects with np.nan
        raw_df = raw_df.fillna(value = np.nan)

        # adjust for stock splits
        if(adj_stocksplit):
            raw_df.prc /= raw_df.cfacpr
            raw_df.shrout *= raw_df.cfacshr

        # Market Equity. Market equity (size) is price times shares outstanding. Price and shares outstanding from CRSP.
        # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
        if('me' in query_vars):
            raw_df['me'] = raw_df.prc.abs() * raw_df.shrout
            # convert market equity to $millions
            raw_df.me /= 1e3

        # adjust for delisting return
        if('adjret' in query_vars):
            raw_df.dlret = raw_df.dlret.fillna(value = 0.0)
            raw_df['adjret'] = ((1 + raw_df.ret) * (1 + raw_df.dlret)) - 1
        if('adjretx' in query_vars):
            raw_df.dlretx = raw_df.dlretx.fillna(value = 0.0)
            raw_df['adjretx'] = ((1 + raw_df.retx) * (1 + raw_df.dlretx)) - 1

        # create dividends paid using 'adjret' and 'adjretx': the dividend yield component times last period's price
        if('adjret' in query_vars and 'adjretx' in query_vars):
            raw_df['dvd'] = (raw_df.adjret - raw_df.adjretx) * raw_df.groupby(['permco'])['prc'].shift(1).abs()

        # create cumulative returns (product of gross per-period returns within each permno)
        if('cumret' in query_vars):
            raw_df['1+ret'] = 1 + raw_df.ret
            raw_df['cumret'] = raw_df.groupby(by = ['permno'])['1+ret'].cumprod()
            raw_df = raw_df.drop(columns = ['1+ret'])

        if('cumretx' in query_vars):
            # BUGFIX: was compounding 'ret' instead of 'retx'
            raw_df['1+retx'] = 1 + raw_df.retx
            raw_df['cumretx'] = raw_df.groupby(by = ['permno'])['1+retx'].cumprod()
            raw_df = raw_df.drop(columns = ['1+retx'])

        if('adjcumret' in query_vars):
            # BUGFIX: was compounding the unadjusted 'ret' instead of the delisting-adjusted 'adjret'
            raw_df['1+adjret'] = 1 + raw_df.adjret
            raw_df['adjcumret'] = raw_df.groupby(by = ['permno'])['1+adjret'].cumprod()
            raw_df = raw_df.drop(columns = ['1+adjret'])

        if('adjcumretx' in query_vars):
            # BUGFIX: condition checked 'adjcumret' twice and compounded 'ret' instead of 'adjretx'
            raw_df['1+adjretx'] = 1 + raw_df.adjretx
            raw_df['adjcumretx'] = raw_df.groupby(by = ['permno'])['1+adjretx'].cumprod()
            raw_df = raw_df.drop(columns = ['1+adjretx'])

        # Dividend Yield. The dividend yield used to form portfolios in June of year t is the total dividends paid from July of t-1 
                # to June of t per dollar of equity in June of t. The dividend yield is computed using the with and without dividend returns 
                # from CRSP, as described in Fama and French, 1988, "Dividend yields and expected stock returns," Journal of Financial Economics 25.
                # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
        # NOTE: Following Fama French the dividend price ratio uses the last year of dividends paid out if possible with a minimum 7 months.
        if('dp' in query_vars):
            if(freq == 'M'):
                min_periods = 7
                window = 12
            else:
                min_periods = 147 # 252 days / 12 months * 7 months
                window = 252
            raw_df['cumdvd'] = raw_df.groupby(['permno'])['dvd'].rolling(min_periods = min_periods, window = window).sum().reset_index(level = 'permno')[['dvd']]
            raw_df['dp'] = raw_df.cumdvd / raw_df.prc.abs()
            raw_df.dp = np.where((raw_df.dp.isnull()) | (raw_df.dp < 0), np.nan, raw_df.dp)
            raw_df = raw_df.drop(columns = ['cumdvd'])

        # reset to original variables, drop duplicates, and reset the index
        raw_df = raw_df[og_vars]
        raw_df = raw_df.drop_duplicates()
        raw_df = raw_df.sort_values(by = ['permno', 'date'])
        raw_df = raw_df.reset_index(drop = True)

        # return the cleaned dataframe
        return(raw_df)

    def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> pd.DataFrame:
        """
        Query the risk-free rate from the Fama-French library on local WRDS. This rate is equivalent to the 1 month T-Bill rate.

        Parameters
        ___________
        start_date: datetime.date\n
            Starting date of the dataset being queried.

        end_date: datetime.date\n
            Ending date of the dataset being queried.

        obs_freq: str\n
            The observational frequency of the CRSP database being queried.
                Choices are:
                    * 'D' : daily
                    * 'M' : monthly
                    * 'A' : annually

        Returns
        ________
        rf_df: pd.DataFrame\n
            Risk-free rate data.

        Note
        _____
        The dataframe returned makes adjustments for NYSE holidays during compounding.

        Note
        _____
        List of queried CRSP variables:\n
            * date : Date of observation
            * rf   : Risk-free rate
        """
        # Monthly observations are dated on the 1st of their month, so a start_date that
        # falls mid-month would miss that month's record. Snap it back to the 1st.
        if(obs_freq in ['M', 'A']):
            month_start = (start_date + MonthBegin(-1)).date()
            if(start_date != month_start):
                start_date = month_start

        # pull the risk-free series from the local sql database
        rf_df = pd.read_sql(self._rf1m_SQL_query(start_date, end_date, obs_freq), con = self.sql_engine)

        # parse the observation dates
        rf_df['date'] = pd.to_datetime(rf_df['date'])

        # shift observation dates to the end of their period for non-daily frequencies
        period_end = {'M': MonthEnd(0), 'A': YearEnd(0)}.get(obs_freq)
        if(period_end is not None):
            rf_df['date'] = rf_df['date'] + period_end

        # hand back the risk-free rate data
        return(rf_df)

    def query_link_table(self) -> pd.DataFrame:
        """
        Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities to
            Compustat companies on permno and gvkey.

        Returns
        ________
        raw_df: pd.DataFrame\n
            The raw linking table pulled from the local WRDS database.

        Note
        _____
        Currently this function only works if a local copy of the WRDS database exists w/ the CCM Linktable.
        """
        sql_str = """
                    SELECT gvkey, lpermno as permno, lpermco as permco, linktype, linkprim, linkdt, linkenddt
                    FROM CRSP_CCMXPF_LINKTABLE
                    WHERE substr(linktype, 1, 1) = 'L'
                    AND (linkprim = 'C' or linkprim = 'P')
                  """

        # read in raw dataframe from local database
        raw_df = pd.read_sql(sql_str, con = self.sql_engine)

        # downcast the numeric identifiers before converting them to strings
        for col in ['permco', 'permno']:
            raw_df[col] = pd.to_numeric(raw_df[col], downcast = 'integer')

        # identifiers are handled as strings throughout the class
        for col in ['gvkey', 'permno', 'permco']:
            raw_df[col] = raw_df[col].astype(str)

        # open-ended links (missing linkenddt) are treated as valid through today
        raw_df.linkenddt = raw_df.linkenddt.fillna(pd.to_datetime('today').date())

        # store the link window bounds as datetimes
        raw_df.linkdt = raw_df.linkdt.astype('datetime64[ns]')
        raw_df.linkenddt = raw_df.linkenddt.astype('datetime64[ns]')

        # return the raw dataframe
        return(raw_df)

# ----------------------------------------------------------------------------------------------------------------------------
# INTERNAL METHODS (class <QueryWRDS>)
#
# These are internal methods and should only be called within this class. Functionality and accuracy of these methods cannot
# be guaranteed if they are called outside of this class.
# ----------------------------------------------------------------------------------------------------------------------------

    # INTERNAL METHOD
    def _list_to_sql_str(self, lst: list, table: str = None) -> str:
        res = ''
        for var in lst:
            if(table is None):
                res += f'\'{var}\', '
            else:
                res += f'{table}.{var}, '
        res = res[:-2]
        return(res)

    # INTERNAL METHOD
    def _CCM_sql_query(self, start_date: datetime.date, end_date: datetime.date, vars: list, specific_query: bool, id_type: str, ids: list):
        sql_str = ''
        table = 'CCM'

        # convert date time object to strings for the SQL query
        start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
        end_date_str  = '\'' + end_date.strftime('%Y-%m-%d') + '\''

        # create argument string
        sql_str += f'SELECT {self._list_to_sql_str(vars, table)} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'

        # additional subsetting
        if(specific_query): sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'

        return(sql_str)
    
    
    # INTERNAL METHOD
    def _compustat_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, indfmt: list, datafmt: list, popsrc: list, consol: list, specific_query: bool, id_type: str, ids: list) -> str:
        """
        INTERNAL METHOD: Create SQL string used to query the WRDS Compustat database.

        Parameters
        ___________
        start_date: Starting date for the dataset queried.
        end_date: Ending date for the dataset queried.
        freq: The observational frequency of the query.
                Choices are:
                    * 'Q' : quarterly
                    * 'A' : annual

        Returns
        ________
        sql_str: String containing the SQL code used to query the specified Compustat database beased on
                    the start and end date and frequency given.
        """

        sql_str = ''
        table = f'COMPA_FUND{freq}'

        # convert date time object to strings for the SQL query
        start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
        end_date_str  = '\'' + end_date.strftime('%Y-%m-%d') + '\''

        # create argument string
        var_str = None
        if(vars is None):
            var_str = '*'
        else:
            var_str = self._list_to_sql_str(vars, table)
        sql_str += f'SELECT {var_str} FROM {table} WHERE datadate BETWEEN {start_date_str} AND {end_date_str}'

        # additional subsetting
        if(len(indfmt) != 0): sql_str += f' AND COMPA_FUND{freq}.indfmt IN ({self._list_to_sql_str(indfmt)})'
        if(len(datafmt) != 0): sql_str += f' AND COMPA_FUND{freq}.datafmt IN ({self._list_to_sql_str(datafmt)})'
        if(len(popsrc) != 0): sql_str += f' AND COMPA_FUND{freq}.popsrc IN ({self._list_to_sql_str(popsrc)})'
        if(len(consol) != 0): sql_str += f' AND COMPA_FUND{freq}.consol IN ({self._list_to_sql_str(consol)})'
        if(specific_query): sql_str += f' AND COMPA_FUND{freq}.{id_type} IN ({self._list_to_sql_str(ids)})'

        return(sql_str)

    # INTERNAL METHOD
    def _CRSP_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, exchcds: list, shrcds: list, specific_query: bool, id_type: str, ids: list) -> str:
        """
        INTERNAL METHOD: Create SQL string used to query the local WRDS CRSP monthly database.

        Parameters
        ___________
        start_date: Starting date for the dataset queried.

        end_date: Ending date for the dataset queried.

        freq: Observational frequency.
            Choices are:
                * 'D' : daily
                * 'M' : monthly

        Returns
        ________
        sql_str : str\n
            String containing the SQL code used to query the specified CRSP database beased on
                the observational frequency and WRDS update frequency of the CRSP database.

        Note
        _____
        Additonal to pulling the daily stock file (dsf) or the monthly stock file (msf) we also pull
            the daily or monthly stock events names file for the exchange code and the share code.
        """

        # table to query from
        sql_str = ''
        table = f'CRSP_{freq}'

        # convert date time object to strings for the SQL query
        start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
        end_date_str  = '\'' + end_date.strftime('%Y-%m-%d') + '\''

        # create argument string
        var_str = self._list_to_sql_str(vars, table)
        sql_str += f'SELECT {var_str} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'

        # additional subsetting
        if(len(exchcds) != 0): sql_str += f' AND exchcd in ({self._list_to_sql_str(exchcds)})'
        if(len(shrcds) != 0): sql_str += f' AND shrcd in ({self._list_to_sql_str(shrcds)})'
        if(specific_query): sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'

        return(sql_str)

    # INTERNAL METHOD
    def _rf1m_SQL_query(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> str:
        """
        INTERNAL METHOD: Create SQL string used to query the Fama-French risk free rate
                            listed on WRDS CRSP in the FF library. This rate is the
                            1 month T-Bill rate.

        Parameters
        ___________
        start_date: str\n
            Starting date for the data being queried.

        end_date: str\n
            Ending date for the data being queried.

        obs_freq: str\n
            The observational frequency of the CRSP delisting database being queried.
                Choices are:
                    * 'D' : daily
                    * 'M' : monthly
                    * 'A' : annual

        Returns
        ________
        sql_str : str\n
            String containing the SQL code used to query the risk free rate in the
                Fama-French (FF) library on CRSP/WRDS database.

        Note
        _____
        Depending on the observational frequency (obs_freq) given the compounding of the
            risk-free rate changes.
        """
        # convert date time object to strings for the SQL query
        start_date_str = start_date.strftime('%Y-%m-%d')
        end_date_str  = end_date.strftime('%Y-%m-%d')

        # Depending on the frequency supplied the compounding changes
        if(obs_freq == 'D'):
            sql_1 = 'strftime(\'%d\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%d\', date) AS diff'
            sql_2 = 'rf AS cumrf'
            library = 'FF_FACTORS_DAILY'
        elif(obs_freq == 'M'):
            sql_1 = 'strftime(\'%m\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%m\', date) AS diff'
            sql_2 = 'rf AS cumrf'
            library = 'FF_FACTORS_MONTHLY'
        elif(obs_freq == 'A'):
            sql_1 = 'strftime(\'%Y\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%Y\', date) AS diff'
            sql_2 = 'EXP(SUM(LN(1 + rf)) OVER (PARTITION BY strftime(\'%Y\', date))) - 1 AS cumrf'
            library = 'FF_FACTORS_MONTHLY'
        else:
            cprint.fatal('No valid observational frequency given.', interrupt = True)

        sql_dic = {'sql_1' : sql_1, 'sql_2' : sql_2, 'library' : library, 'start_date' : '\'' + start_date_str + '\'', 'end_date' : '\'' + end_date_str + '\''}
        sql_str = """
                    SELECT date, rf
                    FROM (
                        SELECT date, {0}, rf, {1}
                        FROM {2}
                        WHERE date BETWEEN {3} AND {4}
                    ) AS crsp_rf
                    WHERE diff != 0 OR diff IS NULL
                  """.format(sql_dic['sql_1'], sql_dic['sql_2'], sql_dic['library'], sql_dic['start_date'], sql_dic['end_date'])
        return(sql_str)

Classes

class QueryWRDS (WRDS_username: str, local_db_path: pathlib.Path = None, update_all_tables: bool = False, tables_to_update: list = [], update_tolerance: int = 3)

Initializes the QueryWRDS class. During this process all of the files needed from WRDS are downloaded to create a local SQL database. Additionally, combined files are created. The first files that are created are CRSP_M (monthly) and CRSP_D (daily). These tables contain all of the variables in the stock file, names file, and delisting file. Additionally, the CCM file is created; this file contains the most commonly used CRSP variables and the Compustat variables used to make the standard Fama-French anomaly characteristics; the standard anomaly characteristics are created

Parameters


WRDS_username: Personal WRDS username.

local_db_path: default = None; Location to create and read from the local SQL database.

update_all_tables: default = False; If true the local database is deleted and recreated by downloading from WRDS.

tables_to_update: default = []; List of tables to update from WRDS.

update_tolerance: default = 3; Number of quarters the tables can be out of date.

Note


If no 'local_db_path' is given then the WRDS database is created in the current directory

Note


The table names in the local SQL database mirror those found on WRDS but with underscores replacing periods. Thus, when updating tables you need to use the local names of the tables (i.e. CRSP_MSF instead of CRSP.MSF)

Note


Anomaly characteristics created: * bm: Book-to-Market * pe: Price-to-Earnings * cfp: Cashflow-to-Price * inv: Investment * op: Operating Profitability

Expand source code
class QueryWRDS:

    # TODO: 
    #       (1) add more detail to string repr
    #       (2) setup update reminder if information is over a year out of date
    #       (3) blocking so it will work on other computers

    def __init__(self, WRDS_username: str, local_db_path: pathlib.Path = None, update_all_tables: bool = False, tables_to_update: list = [], update_tolerance: int = 3) -> None:
        """
        Initalizers the QueryWRDS class. During this process all of the files need from
            WRDS are downloaded to create a local SQL database. Additionally, combined files
            are created. The first files that are created are CRSP_M (monthly) and CRSP_D (daily). These
            tables contain all of the variables in the stock file, names file, and delisting 
            file. Additionally, CCM file is created; this 
            file contain the most commonly used CRSP varaiables and the Compustat variables
            used to make the standard Fama-French anomaly characteristics, the standard 
            anomaly characteristics are created

        Parameters
        ___________
        WRDS_username: Personal WRDS username.

        local_db_path: default = None; Location to create and read from the local SQL database.

        update_all_tables: default = False; If true the local database is deleted and recreated 
            by downloading form WRDS.

        tables_to_update: default = []; List of tables to update from WRDS.

        update_tolerance: default = 3; Number of quarters the tables can be out of date.

        Note
        _____
        If no 'local_db_path' is given then the WRDS database is created in the current directory

        Note
        _____
        The table names in the local SQL database mirror those found on WRDS but with underscores
            replcaing periods. Thus, when updating tables need to use the local names of the tables
            (i.e.) CRSP_MSF instead of CRSP.MSF

        Note
        _____
        Anomally characterisitcs created:
            * bm:   Book-to-Market
            * pe:   Price-to-Earnings
            * cfp:  Cashflow-to-Price
            * inv:  Investment
            * op:   Operating Profitablility
        """
        # list of tables from WRDS to make local
        WRDS_TABLES = ['FF.FACTORS_DAILY',          # FamaFrench Factors daily (used for risk-free rate) 
                       'FF.FACTORS_MONTHLY',        # FamaFrench Factors monthly (used for risk-free rate)
                       'CRSP.CCMXPF_LINKTABLE',     # CCM link table used to merge CRSP and Compustat
                       'CRSP.MSEDELIST',            # CRSP monthly delist events
                       'CRSP.MSF',                  # CRSP monthly stock file
                       'CRSP.MSENAMES',             # CRSP monthly event file
                       'COMPA.FUNDA',               # Compustat annual observations
                       'COMPA.FUNDQ']               # Compustat quarterly observations            


        # list of created tables
        CREATED_TABLES = ['CRSP_M',     # CRSP monthly file (i.e. merged CRSP.MSF, CRSP.MSENAMES, CRSP.MSEDELIST) 
                          'CCM']        # CRSP/Compustat merged file 

        # set class attributes
        self.WRDS_username = WRDS_username
        if(local_db_path is None):
            # set location to be in the current directory
            self.local_db_path = pathlib.Path('WRDS.db')
        else:
            # user specified loaction
            self.local_db_path = local_db_path

        self.update_all_tables = update_all_tables
        self.tables_to_update = tables_to_update
        self.update_tolerance = update_tolerance
        self.today_date = datetime.date.today().strftime('%Y-%m-%d')

        # used to update all tables
        if(self.update_all_tables and self.local_db_path.exists()):
            cprint.warn('Updating the tables in the local database. This process could take a long time...')
            os.remove(self.local_db_path)

        # create sql engine
        self.sql_engine = sqlalchemy.create_engine('sqlite:///' + str(self.local_db_path))

        # list of current tables
        # check to see if all required tables are present, if not load the ones that are missing
        inspect = sqlalchemy.inspect(self.sql_engine)
        self.curr_tables = inspect.get_table_names()

        # delete tables that should be updated
        for table_name in self.tables_to_update:
            if(table_name in self.curr_tables):
                with self.sql_engine.connect() as conn:
                    if(table_name not in CREATED_TABLES):
                        _ = conn.execute(f"""DROP TABLE {table_name}""")
                        # drop combo files and remake 
                        for cr_table in CREATED_TABLES:
                            _ = conn.execute(f'DROP TABLE {cr_table}')
                    else:
                        # only drop the created table and remake
                        _ = conn.execute(f'DROP TABLE {table_name}')

        table_names = [name.replace('.', '_') for name in WRDS_TABLES] # local table names

        # check CSV directory for files to include
        CSV_directory = self.local_db_path.parent / 'CSVtoSQL'
        for csvfile in os.listdir(CSV_directory):
            f = os.path.join(CSV_directory, csvfile)
            if(os.path.isfile(f)):
                filepath = pathlib.Path(f)
                tablename = filepath.name.strip('.csv')
                if(tablename in self.curr_tables): continue
                s = time.time()
                cprint.info(f'Adding {filepath.name} to SQL database {self.local_db_path.name}...')
                subprocess.call(['sqlite3', f'{self.local_db_path}', '.mode csv', f'.import {filepath} {tablename}', '.mode columns'])
                e = time.time()
                cprint.info(f'Finished {filepath.name}: {round(e - s, 3)}s')

        # update current tables
        self.curr_tables = inspect.get_table_names()

        # read in the data from WRDS
        if(not all(elem in self.curr_tables for elem in table_names)):
            missing_tables = list(set(table_names) - set(inspect.get_table_names()))
            cprint.warn(f'The following tables are missing from the local database: {missing_tables}. Querying WRDS to add them to the local database.')
            cprint.info('Connecting to WRDS...')
            self.WRDS_db = wrds.Connection(username = self.WRDS_username)
            for table_name in missing_tables:
                table = table_name.replace('_', '.', 1)
                print('-------------------------------------')
                cprint.info(f'Starting {table}')
                s = time.time()
                sql_str = '''SELECT * FROM ''' + table
                # download the data to a dataframe
                df = self.WRDS_db.raw_sql(sql_str)
                cprint.ok(f'Dataframe in memory: {asizeof.asizeof(df) / (10 ** 9)}GB')
                # add end of month column for CRSP_MSEDELIST
                if(table_name == 'CRSP_MSEDELIST'): df['date'] = df.dlstdt + MonthEnd(0)
                if(table_name == 'CRSP_DSEDELIST'): df['date'] = df.dlstdt # create date column for merging
                # write the dataframe to the local sql database
                df.to_sql(table_name, con = self.sql_engine, if_exists = 'replace', index = False)
                del df
                e = time.time()
                cprint.info(f'Finished {table}: {round(e - s, 3)}s')
                print('-------------------------------------\n')
            cprint.info('Raw WRDS files have been added to the local databse.')

        if('CRSP_M' not in self.curr_tables):
            cprint.info(f'Creating combined data table CRSP_M...')
            sf_df = pd.read_sql(f"""SELECT * FROM CRSP_MSF LIMIT 1""", con = self.sql_engine)
            names_df = pd.read_sql(f"""SELECT * FROM CRSP_MSENAMES LIMIT 1""", con = self.sql_engine)
            delsit_df = pd.read_sql(f"""SELECT * FROM CRSP_MSEDELIST LIMIT 1""", con = self.sql_engine)
            vars_to_select = ''
            for var in list(sf_df.columns):
                vars_to_select += f'CRSP_MSF.{var}, '
            for var in list(set(list(names_df.columns)) - set(list(sf_df.columns))):
                vars_to_select += f'CRSP_MSENAMES.{var}, '
            for var in list(set(list(delsit_df.columns)) - set(list(sf_df.columns)) - set(list(names_df.columns))):
                    vars_to_select += f'CRSP_MSEDELIST.{var}, '
            vars_to_select = vars_to_select[:-2]
            sql_dic = {'vars': vars_to_select}
            sql_str = '''CREATE TABLE CRSP_M AS 
                         SELECT {0} FROM CRSP_MSF 
                         LEFT JOIN CRSP_MSENAMES ON CRSP_MSF.permno = CRSP_MSENAMES.permno AND CRSP_MSENAMES.namedt <= CRSP_MSF.date AND CRSP_MSF.date <= CRSP_MSENAMES.nameendt 
                         LEFT JOIN CRSP_MSEDELIST ON CRSP_MSF.permno = CRSP_MSEDELIST.permno AND CRSP_MSF.date = CRSP_MSEDELIST.date'''.format(sql_dic['vars'])
            with self.sql_engine.connect() as conn:
                _ = conn.execute(sql_str)

            cprint.info('Combined CRSP tables have been created.')

        # create merged CRSP and Compustat table
        if('CCM' not in self.curr_tables):
            table = 'CCM'
            cprint.info(f'Creating combined CRSP and Compustat table: {table}')

            start_date = datetime.date(1900, 6, 30)
            end_date = datetime.date(2100, 6, 30)

            # Compustat -------------------------------------------------------------------------------------------
    
            COMP_df = self.query_Compustat(start_date, end_date, 'A', sub_vars = ['ticker'], add_vars = ['years_in', 'fyear', 'revt', 'adjex_f'])
            COMP_df['year'] = COMP_df.datadate.dt.year

            # create preferrerd stock
            COMP_df['ps'] = np.where(COMP_df.pstkrv.isnull(), COMP_df.pstkl, COMP_df.pstkrv)
            COMP_df.ps = np.where(COMP_df.ps.isnull(), COMP_df.pstk, COMP_df.ps)
            COMP_df.ps = np.where(COMP_df.ps.isnull(), 0, COMP_df.ps)
            COMP_df.txditc = COMP_df.txditc.fillna(0)

            # create book equity
            COMP_df['be'] = np.where(COMP_df.fyear < 1993, COMP_df.seq + COMP_df.txditc - COMP_df.ps, COMP_df.seq - COMP_df.ps) 

            # earnings
            COMP_df['earn'] = np.where(~COMP_df.ib.isnull(), COMP_df.ib, np.nan)

            # operating profitability
            COMP_df['xp_allnan'] = (COMP_df.cogs.isnull()) & (COMP_df.xsga.isnull()) & (COMP_df.xint.isnull())
            COMP_df['profit'] = COMP_df.revt - COMP_df.cogs.fillna(0) - COMP_df.xint.fillna(0) - COMP_df.xsga.fillna(0)
            COMP_df['op'] = np.where(COMP_df.be + COMP_df.mib != 0, COMP_df.profit / (COMP_df.be + COMP_df.mib.fillna(0)), np.nan)
            COMP_df.op = np.where(((~COMP_df.op.isnull()) & (~COMP_df.revt.isnull()) & (~COMP_df.xp_allnan)), COMP_df.op, np.nan)

            # NOTE: Compustat data yields gross outliers in 'op' w/ ratios as large as '1,000'.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'op' outside the 99th percentile are set to missing.
            COMP_df.op = np.where((COMP_df.op <= COMP_df.op.quantile(0.99)), COMP_df.op, np.nan)

            # investment
            try:
                COMP_df['inv'] = np.log(COMP_df['at']) - np.log(COMP_df.groupby(by = ['gvkey'])['at'].shift(1))
            except FloatingPointError:
                COMP_df['inv'] = (COMP_df['at'] / COMP_df.groupby(by = ['gvkey'])['at'].shift(1)) - 1
            COMP_df.inv = np.where(~COMP_df.inv.isnull(), COMP_df.inv, np.nan)

            # NOTE: Compustat data yields gross outliers in 'inv' w/ percentages as low as '-100%' and as large as '10,000%'.
            #       These outliers are pervasive on the left tail of the distribution.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'inv' outside [15th, 99th] percentiles are winsorized.
            COMP_df.inv = np.where((COMP_df.inv.quantile(0.15) <= COMP_df.inv), COMP_df.inv, COMP_df.inv.quantile(0.15))
            COMP_df.inv = np.where((COMP_df.inv <= COMP_df.inv.quantile(0.99)), COMP_df.inv, COMP_df.inv.quantile(0.99))

            # cash flow
            COMP_df['cf'] = COMP_df.ib + COMP_df.txdi.fillna(0) + COMP_df.dpre.fillna(0)
            COMP_df.cf = np.where(~COMP_df.cf.isnull(), COMP_df.cf, np.nan)

            # accruals
            COMP_df['csho_adj'] = np.where((COMP_df.csho * COMP_df.adjex_f > 0), COMP_df.csho * COMP_df.adjex_f, np.nan)
            COMP_df['owcap_adj'] = ((COMP_df.act - COMP_df.che) - (COMP_df.lct.fillna(0) - COMP_df.dlc.fillna(0))) / COMP_df.csho_adj
            COMP_df['d_owcap_adj'] = COMP_df.owcap_adj - COMP_df.groupby(by = ['gvkey'])['owcap_adj'].shift(1)
            COMP_df['ac'] = np.where(~COMP_df.csho_adj.isnull(), COMP_df.d_owcap_adj / (COMP_df.be / COMP_df.csho_adj), np.nan)

            # NOTE: Compustat data yields gross outliers in 'ac' for June of each year {t} w/ ratios as low as '-200' and as large as '200'.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'ac' less than '-200' and values for 'ac' larger than '200' are set to missing.
            COMP_df.ac = np.where(-200 <= COMP_df.ac, COMP_df.ac, np.nan)
            COMP_df.ac = np.where(COMP_df.ac <= 200, COMP_df.ac, np.nan)

            # net shares issused
            COMP_df['ni_csho_adj'] = np.where(COMP_df.csho * COMP_df.adjex_f > 0, COMP_df.csho * COMP_df.adjex_f, np.nan)
            try:
                COMP_df['nsi'] = np.log(COMP_df.ni_csho_adj) - np.log(COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1))
            except FloatingPointError:
                COMP_df['nsi'] = (COMP_df.ni_csho_adj / COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1)) - 1
            COMP_df.nsi = np.where(~COMP_df.nsi.isnull(), COMP_df.nsi, np.nan)

            # NOTE: Compustat data yields outliers in 'ni' w/ ratios as large as '20'.
            #       To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
            #       values for 'ni' outside the 99.9th percentile are set to missing.
            COMP_df.nsi = np.where(COMP_df.nsi <= COMP_df.nsi.quantile(0.999), COMP_df.nsi, np.nan)
            COMP_df = COMP_df.drop(columns = ['owcap_adj', 'xp_allnan'])

            # CRSP ------------------------------------------------------------------------------------------------

            CRSP_df = self.query_CRSP(start_date, end_date, 'M')
            CRSP_df['jdate'] = CRSP_df.date

            # create timing columns
            CRSP_df['year'] = CRSP_df['jdate'].dt.year
            CRSP_df['month'] = CRSP_df['jdate'].dt.month

            # turnover (turn)
            # The turnover (TURN) for each stock is defined the monthly traded volume scaled by the total number of shares outstanding.
            CRSP_df['turn'] = CRSP_df.vol / CRSP_df.shrout

            # traded volume in dollars (dvol)
            # The traded volume in dollars (DVOL) is defined as the number of shares traded in a given month multiplied by the closing stock price.
            CRSP_df['dvol'] = CRSP_df.vol * CRSP_df.prc.abs()

            # calculate prior returns
            # Prior 1-1 is the cummulative return in [t - 1]
            # Prior 2-12 is the cummulative return from [t - 12] to [t - 2]
            # Prior 13-60 is the cummulative return from [t - 60] to [t - 13]
            for ret_typ in ['adjret', 'adjretx']:
                for typ in [(1, 1), (2, 12), (13, 60)]:
                    name = f'pr{typ[0]}_{typ[1]}' if(ret_typ == 'adjret') else f'prx{typ[0]}_{typ[1]}'
                    CRSP_df[name] = 1
                    for i in range(typ[0], typ[1] + 1):
                        CRSP_df[f'{ret_typ}_L{i}'] = 1 + CRSP_df.groupby(by = ['permno'])[ret_typ].shift(i)
                        CRSP_df[name] *= CRSP_df[f'{ret_typ}_L{i}']
                    CRSP_df = CRSP_df.drop(CRSP_df.filter(regex = '_L').columns, axis = 1)
                    CRSP_df[name] -= 1

            ### Aggregate Market Cap ###
            # sum of me across different permno belonging to same permco a given date
            crsp_summe = CRSP_df.groupby(['jdate','permco'])['me'].sum().reset_index()

            # largest mktcap within a permco/date
            crsp_maxme = CRSP_df.groupby(['jdate','permco'])['me'].max().reset_index()

            # join by jdate/maxme to find the permno
            CRSP_df = CRSP_df.merge(crsp_maxme, how='inner', on=['jdate','permco','me'])

            # drop me column and replace with the sum me
            CRSP_df = CRSP_df.drop(columns = ['me'])

            # join with sum of me to get the correct market cap info
            CRSP_df = CRSP_df.merge(crsp_summe, how='inner', on=['jdate','permco'])

            ### July to June dates
            CRSP_df['ffdate'] = CRSP_df['jdate'] + MonthEnd(-6)
            CRSP_df['ffyear'] = CRSP_df['ffdate'].dt.year
            CRSP_df['ffmonth'] = CRSP_df['ffdate'].dt.month
            CRSP_df['1+adjretx'] = 1 + CRSP_df.adjretx

            # cumret by stock and fama-french year
            CRSP_df['ffyear_cumretx'] = CRSP_df.groupby(by = ['permno', 'ffyear'])['1+adjretx'].cumprod()

            # lag of ffyear_cumretx
            CRSP_df['L_ffyear_cumretx'] = CRSP_df.groupby(by = ['permno'])['ffyear_cumretx'].shift(1)

            # lag market cap
            CRSP_df['L_me']=CRSP_df.groupby(by = ['permno'])['me'].shift(1)

            # if first permno then use me/(1+retx) to replace the missing value
            CRSP_df['months_in'] = CRSP_df.groupby(by = ['permno']).cumcount()
            CRSP_df.L_me = np.where(CRSP_df.months_in == 0, CRSP_df.me / CRSP_df['1+adjretx'], CRSP_df.L_me)

            # baseline me
            mebase = CRSP_df[CRSP_df.ffmonth == 1][['permno','ffyear', 'L_me']].rename(columns={'L_me': 'mebase'})

            # merge result back together
            CRSP_df = CRSP_df.merge(mebase, how = 'left', on = ['permno', 'ffyear'])
            CRSP_df['wt'] = np.where(CRSP_df.ffmonth == 1, CRSP_df.L_me, CRSP_df.mebase * CRSP_df.L_ffyear_cumretx)

            # Info as of June & December and merge together for characteristics
            CRSP_JUN_df = CRSP_df[CRSP_df.month == 6]
            CRSP_DEC_df = CRSP_df[CRSP_df.month == 12]
            CRSP_DEC_df = CRSP_DEC_df[['permno','date','jdate','me','year']].rename(columns={'me': 'dec_me'})
            CRSP_DEC_df.year += 1
            CRSP_DEC_df = CRSP_DEC_df[['permno','year','dec_me']]
            CRSP_JUN_df = CRSP_JUN_df.merge(CRSP_DEC_df, how = 'inner', on = ['permno', 'year'])
            CRSP_JUN_df = CRSP_JUN_df[['permno', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'mebase', 'L_me', 'dec_me']]
            CRSP_JUN_df = CRSP_JUN_df.drop_duplicates()

            # query the link table
            link_table_df = self.query_link_table()
            CCM_df = COMP_df.merge(link_table_df, how='left', on=['gvkey'])
            CCM_df['jdate'] = CCM_df.year_end + MonthEnd(6)

            # set link date bounds
            CCM_df = CCM_df[(CCM_df.jdate >= CCM_df.linkdt) & (CCM_df.jdate <= CCM_df.linkenddt)]

            # link COMP_df and crsp
            CCM_JUN_df = CRSP_JUN_df.merge(CCM_df, how = 'inner', on = ['permno', 'jdate'])
            CCM_JUN_df['ffyear'] = CCM_JUN_df.jdate.dt.year

            # drop columns before merging
            CCM_JUN_df = CCM_JUN_df.drop(columns = ['mebase', 'L_me', 'linktype', 'linkprim', 'linkenddt', 'linkdt', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'year', 'permco'])
            CRSP_df = CRSP_df.drop(columns = ['L_me', 'L_ffyear_cumretx', 'ffyear_cumretx', '1+adjretx', 'mebase', 'jdate'])

            # merge to monthly
            CCM_df = CRSP_df.merge(CCM_JUN_df, how = 'left', on = ['permno', 'ffyear'])
            CCM_df = CCM_df.dropna(subset = ['datadate'])
            CCM_df = CCM_df.sort_values(by = ['permno', 'date'])

            # create characterisitcs

            # book to market
            CCM_df['ffbm'] = np.where(CCM_df.dec_me != 0, CCM_df.be / CCM_df.dec_me, np.nan)
            CCM_df['bm'] = np.where(CCM_df.me != 0, CCM_df.be / CCM_df.me, np.nan)

            # earnings to price
            CCM_df['ffep'] = np.where(CCM_df.dec_me != 0, CCM_df.earn / CCM_df.dec_me, np.nan)
            CCM_df['ep'] = np.where(CCM_df.me != 0, CCM_df.earn / CCM_df.me, np.nan)

            # cashflow to price
            CCM_df['ffcfp'] = np.where(CCM_df.dec_me != 0, CCM_df.cf / CCM_df.dec_me, np.nan)
            CCM_df['cfp'] = np.where(CCM_df.me != 0, CCM_df.cf / CCM_df.me, np.nan)

            # market beta
            mkt_s = CCM_df.groupby('date').apply(wavg, 'adjret', 'me')
            mkt_s.name = 'mkt'
            mkt_s = mkt_s.to_frame().reset_index()
            CCM_df = CCM_df.merge(mkt_s, how = 'left', on = ['date'])

            CCM_df['cov'] = CCM_df.groupby(['permno'])[['adjret', 'mkt']].rolling(min_periods = 24, window = 60).cov().unstack()['adjret']['mkt'].reset_index(level = 'permno').rename(columns = {'mkt': 'cov'})[['cov']]
            CCM_df['var'] = CCM_df.groupby(['permno'])['mkt'].rolling(min_periods = 24, window = 60).var().reset_index(level = 'permno').rename(columns = {'mkt': 'var'})[['var']]
            CCM_df['beta'] = CCM_df['cov'] / CCM_df['var']

            ## additional characteristics
            # sale-to-price
            CCM_df['sp'] = np.where(CCM_df.me != 0, CCM_df.sale / CCM_df.me, np.nan)


            #### ISSUES WITH GROWTH VARS

            # earnings growth
            CCM_df['eg'] = CCM_df.earn.pct_change()

            # grwoth in sales
            CCM_df['grs'] = CCM_df.sale.pct_change()

            # growth in book value
            CCM_df['grb'] = CCM_df.be.pct_change()

            # growth in cashflow
            CCM_df['grcf'] = CCM_df.cf.pct_change()

            def _percentile_rank(df, var):
                ptiles = list(df[var].quantile(q = list(np.arange(start = 0, step = 0.01, stop = 1))))
                df[f'{var}_pr'] = 100
                for i in range(99, 0, -1):
                    mask = df[var] < ptiles[i]
                    df.loc[mask, f'{var}_pr'] = i
                return(df)
            
            for var in ['bm', 'sp', 'cfp', 'dp', 'eg', 'grs', 'grcf', 'grb']:
                CCM_df = _percentile_rank(CCM_df, var)
            
            s = time.time()

            # Pastor-Stambaugh Liquidty Index (psliq)
            # The Pastor-Stambaugh Liquidity Index (PSLIQ) is defined for each stock as follow. We run one regression for each calendar month based on each stock’s daily return, using the current daily
            # return in the left-hand side. The right-hand side variables are the lagged daily return as well as the lagged daily return interacted with the lagged traded dollar volume. The coefficient of the
            # interaction term is the measure of liquidity – for each stock and each month.

            # 









            e = time.time()
            print(CCM_df.head(50))
            print(f'time = {e - s}')


            # save merged dataframe to SQL database
            CCM_df = CCM_df.drop_duplicates()
            CCM_df.to_sql(table, con = self.sql_engine, if_exists = 'replace', index = False)
        
            cprint.ok('CRSP and Compustat merged table has been created.')
 
    def __str__(self) -> str:
        """Return a short human-readable summary: the WRDS username in use."""
        # TODO: extend to also print tables and columns from the local database
        summary = f"WRDS Username: {self.username}"
        return summary

    def raw_sql(self, sql_str):
        """
        Allows the user to use raw SQL on the underlying database.

        Parameters
        ___________
        sql_str: The raw SQL statement to execute against the local database.

        Returns
        ________
        A pandas DataFrame with the query result, or None if the user cancels.

        Note
        _____
        This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database.
        """
        cprint.warn('The operation that you are about to perform might damage the local database. Do you wish to continue [y/n]:')
        # the prompt advertises [y/n], so accept 'y'/'Y' with surrounding whitespace
        response = input().strip().lower()
        if(response == 'y'):
            raw_df = pd.read_sql(sql_str, con = self.sql_engine)
            return(raw_df)
        else:
            cprint.info('Operation cancelled.')
            return(None)

    def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pd.DataFrame:
        """
        Used to query the merged CRSP/Compustat (CCM) table.

        Parameters
        ___________
        start_date: The starting date of the data query.
        end_date:   The ending date of the data query.

        Keyword Arguments
        __________________
        vars:     list; The variables to query for.
        add_vars: list; Additional variables to query for ontop of the default variables.
        sub_vars: list; Variables to remove from the default variables.
        all_vars: bool; Set to True to query for all variables in the table.
        id_type:  str;  Type of ID used to query for specific assets.
            Choices are:
                * ticker
                * gvkey
                * permno
                * cusip
                * permco
        ids:      list; The ids of type 'id_type' to query for.

        Note
        _____
        The full set of variables that can be queried for is given by VALID_VARS
            in the method body; the variables returned when no keyword arguments
            are given are those in DEFAULT_VARS.
        """
        # vars that can be queiried for
        VALID_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'dp', 'year', 'month', 'pr1_1', 'pr2_12', 'pr13_60', 'prx1_1', 'prx2_12', 'prx13_60', 
                      'me', 'ffdate', 'ffyear', 'ffmonth', 'months_in', 'wt', 'dec_me', 'dltt', 'mib', 'revt', 'csho', 'adjex_f', 'act', 'xint', 'pstk', 'txdi', 'gvkey', 'ib', 'xsga', 'dlc', 'ceq', 'che', 'datadate', 'txdc', 'dpc', 'ibc', 
                      'fyear', 'pstkl', 'teq', 'cogs', 'pstkrv', 'lct', 'dpre', 'txditc', 'seq', 'at', 'sale', 'year_end', 'years_in', 'ps', 'be', 'earn', 'profit', 'op', 'inv', 'cf', 'csho_adj', 'd_owcap_adj', 'ac', 'ni_csho_adj', 'nsi', 'ffbm', 
                      'bm', 'ffep', 'ep', 'ffcfp', 'cfp', 'beta']

        # if no keyword arguments are given then these are the defaults returned
        DEFAULT_VARS = ['date', 'gvkey', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'datadate', 'year_end', 'ffdate', 'prc', 'shrout', 'adjret', 'adjretx', 
                        'me', 'wt', 'dp', 'be', 'bm', 'ffbm', 'ep', 'ffep', 'cfp', 'ffcfp', 'inv', 'op', 'pr2_12', 'pr1_1', 'pr13_60', 'beta', 'ac', 'nsi', 'years_in', 'months_in', 'month', 'ffyear']

        # dtype applied to each non-date column after the raw SQL read
        VARS_DATA_TYPE = {'permno': str, 'permco': str, 'ticker': str, 'shrcd': str, 'exchcd': str, 'prc': float, 'shrout': float, 'adjret': float, 'adjretx': float, 'adjcumret': float, 
                          'adjcumretx': float, 'dp': float, 'year': int, 'month': int, 'pr1_1': float, 'pr2_12': float, 'pr13_60': float, 'prx1_1': float, 'prx2_12': float, 'prx13_60': float, 
                          'me': float, 'ffyear': int, 'ffmonth': int, 'months_in': int, 'wt': float, 'dec_me': float, 'dltt': float, 'mib': float, 'revt': float, 'csho': float, 'adjex_f': float, 
                          'act': float, 'xint': float, 'pstk': float, 'txdi': float, 'gvkey': str, 'ib': float, 'xsga': float, 'dlc': float, 'ceq': float, 'che': float, 'txdc': float, 'dpc': float, 'ibc': float, 
                          'fyear': int, 'pstkl': float, 'teq': float, 'cogs': float, 'pstkrv': float, 'lct': float, 'dpre': float, 'txditc': float, 'seq': float, 'at': float, 'sale': float, 'years_in': int, 
                          'ps': float, 'be': float, 'earn': float, 'profit': float, 'op': float, 'inv': float, 'cf': float, 'csho_adj': float, 'd_owcap_adj': float, 'ac': float, 'ni_csho_adj': float, 'nsi': float, 'ffbm': float, 
                          'bm': float, 'ffep': float, 'ep': float, 'ffcfp': float, 'cfp': float, 'beta': float}

        #############################################################################################################################################
        # Query Validation and Error Checking
        #############################################################################################################################################

        # keywords 'add_vars' and 'vars' cannot be used simultaneously
        if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

        # create the list of the variables being queried; always copy caller-supplied
        # lists so the caller's list object is never mutated as a side effect
        query_vars = None
        if('vars' in kwargs):
            # variable arguments to query for
            query_vars = list(kwargs['vars'])

            # 'permno' needs to be in the query vars for merging
            if('permno' not in query_vars): query_vars.insert(0, 'permno')

            # add date if people forgot
            if('date' not in query_vars): query_vars.insert(0, 'date')
        else:
            if('add_vars' in kwargs):
                query_vars = DEFAULT_VARS + kwargs['add_vars']
            else:
                query_vars = list(DEFAULT_VARS)
            if('sub_vars' in kwargs):
                query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]

        # respect the boolean value of 'all_vars' (all_vars = False must not trigger this);
        # copy so later list operations can never mutate the VALID_VARS constant
        if(kwargs.get('all_vars')): query_vars = list(VALID_VARS)

        # make sure that all vars are valid to be queried
        all_valid = all(elem in VALID_VARS for elem in query_vars)
        if(not all_valid):
            incorrect_vars = list(set(query_vars) - set(VALID_VARS))
            raise Exception(f'Variables {incorrect_vars} cannot be queried from the combined CRSP/Compustat merged table. The CCM table does not contain all of the variables that are in CRSP and Compustat.')

        # 'id_type' and 'ids' must be given together when querying specific assets
        specific_query = False
        id_type = ''
        ids = []
        if('id_type' in kwargs or 'ids' in kwargs):
            if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
            if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
            specific_query = True
            id_type = kwargs['id_type']
            ids = kwargs['ids']

        ##############################################################################################################################################
        # Load the raw data
        ##############################################################################################################################################

        # read in raw dataframe from local sql database
        raw_df = pd.read_sql(self._CCM_sql_query(start_date, end_date, vars = query_vars, 
                                                 specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)

        ##############################################################################################################################################
        # Clean the raw data
        ##############################################################################################################################################

        # 'fyear' comes back from the database with an inconsistent type; force float
        # (query_vars is always a list by this point, so no None check is needed)
        if('fyear' in query_vars):
            raw_df.fyear = raw_df.fyear.astype(float)

        # get the non-date vars present in the dataframe
        quried_vars = list(set(list(raw_df.columns)) - set(['date', 'datadate', 'ffdate', 'year_end']))
        vars_dtypes = {}
        for var in quried_vars:
            vars_dtypes[var] = VARS_DATA_TYPE[var]

        # convert the date columns that were actually queried for; only 'date' is
        # guaranteed to be present, so guard each conversion by column presence
        for date_col in ('date', 'datadate', 'ffdate', 'year_end'):
            if(date_col in raw_df.columns):
                raw_df[date_col] = pd.to_datetime(raw_df[date_col], format = '%Y-%m-%d')
        raw_df = raw_df.astype(vars_dtypes)

        # replace any python 'None' objects with np.nan
        raw_df = raw_df.fillna(value = np.nan)

        # reset to the requested variables, drop duplicates, and reset the index
        raw_df = raw_df[query_vars]
        raw_df = raw_df.drop_duplicates()
        raw_df = raw_df.sort_values(by = ['permno', 'date'])
        raw_df = raw_df.reset_index(drop = True)

        # return dataframe
        return(raw_df)

    def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) -> pd.DataFrame:
        """
        Used to query the raw Compustat tables.

        Parameters
        ___________
        start_date: The starting date of the data query.
        end_date:   The ending date of the data query.
        freq:       The frequency of the data query.
            Choices are:
                * Q: quarterly
                * A: annual

        Keyword Arguments
        __________________
        vars:     list; The variables to query for.
        add_vars: list; Additional variables to query for ontop of the default variables.
        sub_vars: list; Variables to remove from the default variables.
        all_vars: bool; Set to True to query for all variables in the table.
        id_type:  str;  Type of ID used to query for specific assets.
            Choices are:
                * ticker
                * gvkey
                * permno
                * cusip
                * permco
        ids:      list; The ids of type 'id_type' to query for.
        indfmts:  list; Industry formats to query for (default: ['INDL']).
        datafmts: list; Data formats to query for (default: ['STD']).
        popsrcs:  list; Population sources to query for (default: ['D']).
        consols:  list; Consolidation levels to query for (default: ['C']).

        Note
        _____
        The default variables that are queried for from the quarterly file have their names changed to mirror those in the annual
            file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the 
            fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to 
            the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make 
            it easier to compute the anomaly characteristics when creating the combined CCM tables.  

        Note
        _____
        By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks.
            To do this you need to know the actual name of the variable that you want to query for, paying attention to Compustat's
            naming conventions between their annual and quarterly files.

        Note
        _____
        The default variables that are queried for if the frequency given is annual:
            'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq', 
            'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib'

            If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'.

        Note
        _____
        There is less error checking in this function compared to the other methods in this class because of the large number of variables
            in Compustat.
        """
        # default variables per frequency (quarterly names are renamed to the annual
        # conventions after the query, see below)
        STD_VARS = None
        if(freq == 'A'):
            STD_VARS = ['gvkey', 'datadate', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdc', 'dpc', 'che', 'dlc', 'ceq', 'seq', 'teq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'ibc', 'dltt', 'mib', 'ib', 'dp']
        else:
            STD_VARS = ['gvkey', 'datadate', 'tic', 'atq', 'saleq', 'cogsq', 'actq', 'txdiq', 'cshoq', 'lctq', 'txdcy', 'dpcy', 'cheq', 'dlcq', 'ceqq', 'seqq', 'teqq', 'pstkq', 'txditcq', 'xintq', 'xsgaq', 'ibcy', 'dlttq', 'mibq', 'ibq', 'dpq']

        # dtypes applied (where present) after the raw SQL read
        DEFAULT_DTYPES = {'gvkey': str, 'ticker': str, 'at': float, 'sale': float, 'cogs': float, 'act': float, 'txdi': float, 'csho': float, 'lct': float, 'dltt': float, 'mib': float,
                          'txdc': float, 'dpre': float, 'che': float, 'dlc': float, 'ceq': float, 'seq': float, 'teq': float, 'pstk': float, 'txditc': float, 'xint': float, 'xsga': float, 'ibc': float, 'ib': float}

        # variables created in RAM (not present in the underlying database table)
        CREATED_VARS = ['years_in']

        #############################################################################################################################################
        # Query Validation and Error Checking
        #############################################################################################################################################

        if(freq not in ['Q', 'A']): raise Exception('Invlaid frequency given to query_compustat')

        # keywords 'add_vars' and 'vars' cannot be used simultaneously
        if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

        # create the list of the variables being queried; always copy caller-supplied
        # lists so the caller's list object is never mutated as a side effect
        query_vars = None
        if('vars' in kwargs):
            # variable arguments to query for
            query_vars = list(kwargs['vars'])

            # 'gvkey' needs to be in the query vars for merging
            if('gvkey' not in query_vars): query_vars.insert(0, 'gvkey')

            # add date if people forgot
            if('datadate' not in query_vars and 'date' not in query_vars): query_vars.insert(0, 'datadate')
        else:
            if('add_vars' in kwargs):
                query_vars = STD_VARS + kwargs['add_vars']
            else:
                query_vars = list(STD_VARS)
            if('sub_vars' in kwargs):
                sub_vars = ['tic' if elem == 'ticker' else elem for elem in kwargs['sub_vars']]
                query_vars = [elem for elem in query_vars if elem not in sub_vars]

        # translate user-facing names to Compustat's column names
        query_vars = ['datadate' if elem == 'date' else elem for elem in query_vars]
        query_vars = ['tic' if elem == 'ticker' else elem for elem in query_vars]
        query_vars = ['conm' if elem == 'comnam' else elem for elem in query_vars]

        # None signals the SQL builder to select every column; respect the boolean
        # value of 'all_vars' (previously all_vars = False still triggered this, and
        # the None sentinel crashed in the set() arithmetic below)
        if(kwargs.get('all_vars')): query_vars = None

        indfmts = kwargs['indfmts'] if('indfmts' in kwargs) else ['INDL']           # default: Industrial
        datafmts = kwargs['datafmts'] if('datafmts' in kwargs) else ['STD']         # default: Standard
        popsrcs = kwargs['popsrcs'] if('popsrcs' in kwargs) else ['D']              # default: Domestic
        consols = kwargs['consols'] if('consols' in kwargs) else ['C']              # default: Consolidated

        # 'id_type' and 'ids' must be given together when querying specific assets
        specific_query = False
        id_type = ''
        ids = []
        if('id_type' in kwargs or 'ids' in kwargs):
            if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
            if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
            specific_query = True
            id_type = kwargs['id_type']
            if(id_type == 'ticker'): id_type = 'tic'
            ids = kwargs['ids']

        # drop created vars before hitting the database (None means all columns)
        query_vars_DB = None if(query_vars is None) else list(set(query_vars) - set(CREATED_VARS))

        ##############################################################################################################################################
        # Load the raw data
        ##############################################################################################################################################

        # read in raw dataframe from local sql database
        raw_df = pd.read_sql(self._compustat_SQL_query(start_date, end_date, freq, vars = query_vars_DB, 
                                                       indfmt = indfmts, datafmt = datafmts, popsrc = popsrcs, consol = consols,
                                                       specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)


        ##############################################################################################################################################
        # Clean the raw data
        ##############################################################################################################################################

        # rename columns to the user-facing names
        raw_df = raw_df.rename(columns = {'tic': 'ticker', 'conm':'comnam'})

        # rename the default columns to match the names from the COMPA_FUNDA
        if(freq == 'Q'):
            # quarterly compustat
            # dont balme me for the different names blame compustat
            raw_df = raw_df.rename(columns = {'atq':'at', 'seqq':'seq', 'ceqq':'ceq', 'teqq':'teq',
                                              'pstkq':'pstk', 'txdcy':'txdc', 'txditcq':'txditc', 'saleq':'sale',
                                              'cogsq':'cogs', 'xintq':'xint', 'xsgaq':'xsga', 'mibq':'mib', 
                                              'ibcy':'ibc', 'txdiq':'txdi', 'dpq':'dpre', 'cshoq':'csho', 'adjex':'adjex_f',
                                              'actq':'act', 'lctq':'lct', 'cheq':'che', 'dlcq':'dlc', 'dlttq': 'dltt', 'ibq': 'ib'})
        else:
            # annual compustat
            # rename columns for consistency
            raw_df = raw_df.rename(columns = {'dp': 'dpre'})

        # make date a datetime.date and align to the end of the year/quarter
        raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
        if(freq == 'A'):
            raw_df['year_end'] = raw_df.datadate + YearEnd(0)
        else:
            raw_df['quarter_end'] = raw_df.datadate + QuarterEnd(0)

        # 'fyear' comes back with an inconsistent type; force float. Check the
        # returned columns so this also works when query_vars is None (all_vars)
        if('fyear' in raw_df.columns): 
            raw_df.fyear = raw_df.fyear.astype(float)

        # add years in to compustat (created var; always added when all vars requested)
        if(query_vars is None or 'years_in' in query_vars):
            raw_df['years_in'] = raw_df.groupby(by = ['gvkey']).cumcount()

        # get vars in the dataframe that have a known dtype
        quried_vars = list(set(list(raw_df.columns)) - set(['date']))
        vars_dtypes = {}
        for var in quried_vars:
            if(var in DEFAULT_DTYPES):
                vars_dtypes[var] = DEFAULT_DTYPES[var]

        # convert dtypes
        raw_df = raw_df.fillna(value = np.nan)
        raw_df = raw_df.astype(vars_dtypes)

        # sort just for ease of reading
        raw_df = raw_df.drop_duplicates()
        sorting_dims = ['gvkey', 'year_end'] if(freq == 'A') else ['gvkey', 'quarter_end']
        raw_df = raw_df.sort_values(by = sorting_dims)
        raw_df = raw_df.reset_index(drop = True)

        # return the dataframe
        return(raw_df)

    def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) -> pd.DataFrame:
        """
        Used to query the raw CRSP files. Additonally, variables can be created and stock splits can be adjusted for.

        Parameters
        ___________
        start_date: The starting date of the data query.
        end_date:   The ending date of the data query.
        freq:       The frequency of the data query.
            Choices are:
                * M: quarterly
                * D: annual
        adj_stocksplit: default = True; Whether or not to adjust for a stock split event.

        Keyword Arguments
        __________________
        vars:     list; The variables to query for.
        add_vars: list; Additional variables to query for ontop of the default variables.
        sub_vars: list; Variables to remove from the default variables.
        all_vars: bool; Set to true to query for all variables in the table.
        id_type:  str;  Type of ID used to query for specific assets.
            Choices are:
                    * ticker
                    * gvkey
                    * permno
                    * cusip
                    * permco
        ids:      list; The ids of type 'id_type' to query for.
        exchcds:  list; The exchange codes to use for querying (default: NYSE, NYSE Market (formally AMEX), and NASDAQ)
        shrcds:   list; The share codes to use for querying (default: US Common Stock) 

        Note
        _____
        If the frequency is monthly then the variables that can be quiried for are:
            'date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 
            'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 
            'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 
            'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt'

        Note
        _____
        If the frequency is daily then the variables that can be quiried for are:
            'date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
            'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 
            'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 
            'nwperm', 'dlamt', 'openprc', 'numtrd'
            
        Note
        _____
        The variabels that are created from CRSP primatives are:
            * 'me':         Market Equity (millions)
            * 'adjret':     Returns adjusted for delisting events
            * 'adjretx':    Returns adjusted for delisting events ex. dividend
            * 'dvd':        Dividend (uses 'adjret' and 'adjretx' to calculate)
            * 'dp':         Dividend-to-Price Ratio 

        Note
        _____
        This function defaults to querying for all companies that are consistent with Famas defintions. That is to say assets with a share code of 10 or 11 
            and an exchange code of 1, 2, or 3.
        """
        # variables that can be queried for
        STD_VARS = None
        if(freq == 'M'):
            STD_VARS = ['date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
                        'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 
                        'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 
                        'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt']
        else:
            STD_VARS = ['date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
                        'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 
                        'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 
                        'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd']

        CREATE_VARS = ['me', 'adjret', 'adjretx', 'dvd', 'dp', 'cumret', 'cumretx', 'adjcumret', 'adjcumretx']

        VALID_VARS = STD_VARS + CREATE_VARS

        # if no keyword arguments are given then these are the defaults returned
        DEFAULT_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'me', 'dp', 'vol']

        # variable data types
        VARS_DATA_TYPE = {'cusip': str, 'permno': str, 'permco' : str, 'comnam': str, 'compno': str, 'ticker': str, 
                          'primexch': str, 'tsymbol': str, 'secstat': str, 'hsiccd': str, 'naics': str, 'siccd': str, 'trdstat': str, 'ncusip': str,
                          'shrcd': str, 'exchcd': str, 'issuno': str, 'hexcd': str, 'shrcls': str, 
                          'ret': float, 'retx': float, 'shrout': float, 'prc': float, 'cfacshr': float, 'cfacpr': float, 
                          'bidlo': float, 'bid': float, 'ask': float, 'askhi': float, 'spread': float, 'altprc': float, 'vol': float,
                          'dlstdt': str, 'dlstcd': str, 'nwperm': str, 'nwcomp': str, 'nextdt': str, 'dlamt': float, 'dlretx': float, 'dlprc': float, 
                          'dlpdt': str, 'dlret': float, 'acperm': str, 'accomp': str, 'me': float, 'adjret': float, 'adjretx': float, 'dvd': float, 
                          'adjdvd': float, 'dp': float, 'openprc': float, 'numtrd': float, 'cumret': float, 'cumretx': float, 'adjcumret': float, 'adjcumretx': float}

        #############################################################################################################################################
        # Query Validation and Error Checking
        #############################################################################################################################################

        if(freq not in ['D', 'M']): raise Exception('Invlaid frequency given to query_CRSP')

        # keywrods 'additional_vars' and 'vars' cannot be used simultaneously
        if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

        # create list of the variables being quireied 
        query_vars = None
        if('vars' in kwargs):
            # variable arguments to query for
            query_vars = kwargs['vars']

            # 'permno' needs to be in the query vars for merging
            if('permno' not in query_vars): query_vars.insert(0, 'permno')

            # add date if people forgot
            if('date' not in query_vars): query_vars.insert(0, 'date')
        else:
            if('add_vars' in kwargs):
                query_vars = DEFAULT_VARS + kwargs['add_vars']
            else:
                query_vars = DEFAULT_VARS
            if('sub_vars' in kwargs):
                query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
            

        if('all_vars' in kwargs): query_vars = VALID_VARS

        # used for dataframe formatting at the end
        og_vars = query_vars.copy()

        # make sure that all vars are valid to be quieired (if your every actually reading these comments u know that i cant spell queried without googleing it)
        all_valid = all(elem in VALID_VARS for elem in query_vars)
        if(not all_valid):
            incorrect_vars = list(set(query_vars) - set(VALID_VARS))
            raise Exception(f'Variables {incorrect_vars} cannot be queried from CRSP.')

        # always adjust for stock splits (can disable this)
        if(adj_stocksplit):
            if('prc' not in query_vars): query_vars.append('prc')
            if('cfacpr' not in query_vars): query_vars.append('cfacpr')
            if('shrout' not in query_vars): query_vars.append('shrout')
            if('cfacshr' not in query_vars): query_vars.append('cfacshr')

        # make sure if created variables are being queiried for then add the variables needed to create them
        if('me' in query_vars):
            if('prc' not in query_vars): query_vars.append('prc')
            if('shrout' not in query_vars): query_vars.append('shrout')

        if('dp' in query_vars):
            if('dvd' not in query_vars): query_vars.append('dvd')

        if('dvd' in query_vars):
            if('adjret' not in query_vars): query_vars.append('adjret')
            if('adjretx' not in query_vars): query_vars.append('adjretx')
            if('prc' not in query_vars): query_vars.append('prc')

        if('adjret' in query_vars):
            if('ret' not in query_vars): query_vars.append('ret')
            if('dlret' not in query_vars): query_vars.append('dlret')

        if('adjretx' in query_vars):
            if('retx' not in query_vars): query_vars.append('retx')
            if('dlretx' not in query_vars): query_vars.append('dlretx')

        if('cumret' in query_vars):
            if('ret' not in query_vars): query_vars.append('ret')

        if('cumretx' in query_vars):
            if('retx' not in query_vars): query_vars.append('retx')

        if('adjcumret' in query_vars):
            if('ret' not in query_vars): query_vars.append('ret')
            if('dlret' not in query_vars): query_vars.append('dlret')
            
        if('adjcumretx' in query_vars):
            if('retx' not in query_vars): query_vars.append('retx')
            if('dlretx' not in query_vars): query_vars.append('dlretx')


        exchcds = kwargs['exchcds'] if('exchcds' in kwargs) else [1, 2, 3] # default: NYSE, NYSE MKT, NASDAQ
        shrcds = kwargs['shrcds'] if('shrcds' in kwargs) else [10, 11]     # default: US-based common stock

        specific_query = False
        id_type = ''
        ids = []
        if('id_type' in kwargs or 'ids' in kwargs):
            if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
            if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
            specific_query = True
            id_type = kwargs['id_type']
            ids = kwargs['ids']

        # created vars are not in the table so remove them
        db_vars = [var for var in query_vars if var not in CREATE_VARS]

        ##############################################################################################################################################
        # Load the raw data
        ##############################################################################################################################################

        # read in raw dataframe from local sql database
        raw_df = pd.read_sql(self._CRSP_SQL_query(start_date, end_date, freq, vars = db_vars, exchcds = exchcds, shrcds = shrcds, 
                                                  specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)

        ##############################################################################################################################################
        # Clean the raw data
        ##############################################################################################################################################

        # I HATE U SEC
        DOWNCAST_VARS = ['permno', 'permco', 'exchcd', 'issuno', 'hexcd', 'shrcd', 'compno', 'hsiccd', 'naics', 'siccd', 'acperm', 'accomp', 'dlstcd', 'nwcomp', 'nwperm']
        for var in DOWNCAST_VARS:
            if(var in query_vars):
                raw_df[var] = raw_df[var].astype('Int64')

        # get vars in the dataframe
        quried_vars = list(set(list(raw_df.columns)) - set(['altprcdt', 'date', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']))
        vars_dtypes = {}
        for var in quried_vars:
            vars_dtypes[var] = VARS_DATA_TYPE[var]

        # convert dates to datetime.dates and align to end of month 
        raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
        if(freq == 'M'): raw_df.date += MonthEnd(0)

        if('altprcdt' in query_vars): raw_df.altprcdt = pd.to_datetime(raw_df.altprcdt, format = '%Y-%m-%d')
        if('nameendt' in query_vars): raw_df.nameendt = pd.to_datetime(raw_df.nameendt, format = '%Y-%m-%d')
        if('namedt' in query_vars): raw_df.namedt = pd.to_datetime(raw_df.namedt, format = '%Y-%m-%d')
        if('dlstdt' in query_vars): raw_df.dlstdt = pd.to_datetime(raw_df.dlstdt, format = '%Y-%m-%d')
        if('nextdt' in query_vars): raw_df.nextdt = pd.to_datetime(raw_df.nextdt, format = '%Y-%m-%d')
        if('dlpdt' in query_vars): raw_df.dlpdt = pd.to_datetime(raw_df.dlpdt, format = '%Y-%m-%d')

        # make sure that the data is the correct type
        raw_df = raw_df.astype(vars_dtypes)

        # replace and python objects 'None' to np.nan
        raw_df = raw_df.fillna(value = np.nan)

        # adjust for stock splits
        if(adj_stocksplit):
            raw_df.prc /= raw_df.cfacpr
            raw_df.shrout *= raw_df.cfacshr

        # Market Equity. Market equity (size) is price times shares outstanding. Price and shares outstanding from CRSP.
        # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
        if('me' in query_vars):
            raw_df['me'] = raw_df.prc.abs() * raw_df.shrout
            # convert market equity to $millions
            raw_df.me /= 1e3

        # adjust for delisting return
        if('adjret' in query_vars):
            raw_df.dlret = raw_df.dlret.fillna(value = 0.0)
            raw_df['adjret'] = ((1 + raw_df.ret) * (1 + raw_df.dlret)) - 1
        if('adjretx' in query_vars):
            raw_df.dlretx = raw_df.dlretx.fillna(value = 0.0)
            raw_df['adjretx'] = ((1 + raw_df.retx) * (1 + raw_df.dlretx)) - 1

        # create dividends paid using 'adjret' and 'adjretx' then 'ret' and 'retx' in that order
        if('adjret' in query_vars and 'adjretx' in query_vars):
            raw_df['dvd'] = (raw_df.adjret - raw_df.adjretx) * raw_df.groupby(['permco'])['prc'].shift(1).abs()

        # create cummulative returns
        if('cumret' in query_vars):
            raw_df['1+ret'] = 1 + raw_df.ret
            raw_df['cumret'] = raw_df.groupby(by = ['permno'])['1+ret'].cumprod()
            raw_df = raw_df.drop(columns = ['1+ret'])

        if('cumretx' in query_vars):
            raw_df['1+retx'] = 1 + raw_df.ret
            raw_df['cumretx'] = raw_df.groupby(by = ['permno'])['1+retx'].cumprod()
            raw_df = raw_df.drop(columns = ['1+retx'])

        if('adjcumret' in query_vars):
            raw_df['1+adjret'] = 1 + raw_df.ret
            raw_df['adjcumret'] = raw_df.groupby(by = ['permno'])['1+adjret'].cumprod()
            raw_df = raw_df.drop(columns = ['1+adjret'])

        if('adjcumret' in query_vars):
            raw_df['1+adjretx'] = 1 + raw_df.ret
            raw_df['adjcumretx'] = raw_df.groupby(by = ['permno'])['1+adjretx'].cumprod()
            raw_df = raw_df.drop(columns = ['1+adjretx'])

        # Dividend Yield. The dividend yield used to form portfolios in June of year t is the total dividends paid from July of t-1 
                # to June of t per dollar of equity in June of t. The dividend yield is computed using the with and without dividend returns 
                # from CRSP, as described in Fama and French, 1988, “Dividend yields and expected stock returns,” Journal of Financial Economics 25.
                # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
        # NOTE: Following Fama Fench the dividend price ratio uses the last year of dividends paid out if possible with a minimum 7 months.
        if('dp' in query_vars):
            if(freq == 'M'):
                min_periods = 7
                window = 12
            else:
                min_periods = 147 # 252 days / 12 months * 7 months
                window = 252
            raw_df['cumdvd'] = raw_df.groupby(['permno'])['dvd'].rolling(min_periods = min_periods, window = window).sum().reset_index(level = 'permno')[['dvd']]
            raw_df['dp'] = raw_df.cumdvd / raw_df.prc.abs()
            raw_df.dp = np.where((raw_df.dp.isnull()) | (raw_df.dp < 0), np.nan, raw_df.dp)
            raw_df = raw_df.drop(columns = ['cumdvd'])

        # reset to original variables, drop duplicates, and reset the index
        raw_df = raw_df[og_vars]
        raw_df = raw_df.drop_duplicates()
        raw_df = raw_df.sort_values(by = ['permno', 'date'])
        raw_df = raw_df.reset_index(drop = True)

        # return the raw dataframe and path where it was saved
        return(raw_df)

    def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> pd.DataFrame:
        """
        Query the risk-free rate (equivalent to the 1 month T-Bill rate) from the
            Fama-French library in the local WRDS database.

        Parameters
        ___________
        start_date: datetime.date\n
            Starting date of the dataset being queried.

        end_date: datetime.date\n
            Ending date of the dataset being queried.

        obs_freq: str\n
            The observational frequency of the CRSP database being queried.
                Choices are:
                    * 'D' : daily
                    * 'M' : monthly
                    * 'A' : annually

        Returns
        ________
        rf_df: pd.DataFrame\n
            Risk-free rate data with columns:
                * date : Date of observation
                * rf   : Risk-free rate

        Note
        _____
        The dataframe returned makes adjustments for NYSE holidays during compounding.
        """
        # Monthly observations are dated on the 1st of each month, so a 'start_date'
        # that falls mid-month would miss that month's observation. Snap it back to
        # the 1st so the query picks the observation of interest up.
        if(obs_freq in ['M', 'A']):
            month_begin = (start_date + MonthBegin(-1)).date()
            if(start_date != month_begin): start_date = month_begin

        # pull the raw observations from the local sql database
        rf_df = pd.read_sql(self._rf1m_SQL_query(start_date, end_date, obs_freq), con = self.sql_engine)

        # dates come back as strings; convert to pandas datetimes
        rf_df['date'] = pd.to_datetime(rf_df['date'])

        # shift trading dates to end-of-period for non-daily frequencies
        if(obs_freq == 'M'):
            rf_df['date'] = rf_df['date'] + MonthEnd(0)
        elif(obs_freq == 'A'):
            rf_df['date'] = rf_df['date'] + YearEnd(0)

        # hand back the cleaned dataframe
        return(rf_df)

    def query_link_table(self) -> pd.DataFrame:
        """
        Query the CRSP/Compustat (CCM) Merged Linking Table used to match CRSP securities
            (permno/permco) to Compustat companies (gvkey).

        Returns
        ________
        link_df: pd.DataFrame\n
            The raw linking table pulled from the local WRDS database.

        Note
        _____
        Currently this function only works if a local copy of the WRDS database exits w/ the CCM Linktable.
        """
        sql_str = """
                    SELECT gvkey, lpermno as permno, lpermco as permco, linktype, linkprim, linkdt, linkenddt
                    FROM CRSP_CCMXPF_LINKTABLE
                    WHERE substr(linktype, 1, 1) = 'L'
                    AND (linkprim = 'C' or linkprim = 'P')
                  """

        # pull the raw linking table from the local database
        link_df = pd.read_sql(sql_str, con = self.sql_engine)

        # permno/permco may arrive as floats; collapse to integers first so the
        # string form below carries no trailing '.0'
        for col in ['permco', 'permno']:
            link_df[col] = pd.to_numeric(link_df[col], downcast = 'integer')

        # identifiers are handled as strings throughout this class
        for col in ['gvkey', 'permno', 'permco']:
            link_df[col] = link_df[col].astype(str)

        # an open link (missing linkenddt) is treated as valid through today
        link_df.linkenddt = link_df.linkenddt.fillna(pd.to_datetime('today').date())

        # convert the link validity window to datetimes
        link_df.linkdt = link_df.linkdt.astype('datetime64[ns]')
        link_df.linkenddt = link_df.linkenddt.astype('datetime64[ns]')

        # hand back the cleaned linking table
        return(link_df)

# ----------------------------------------------------------------------------------------------------------------------------
# INTERNAL METHODS (class <QueryWRDS>)
#
# These are internal methods and should only be called within this class. Functionality and accuracy of these methods cannot
# be guaranteed if they are called outside of this class.
# ----------------------------------------------------------------------------------------------------------------------------

    # INTERNAL METHOD
    def _list_to_sql_str(self, lst: list, table: str = None) -> str:
        res = ''
        for var in lst:
            if(table is None):
                res += f'\'{var}\', '
            else:
                res += f'{table}.{var}, '
        res = res[:-2]
        return(res)

    # INTERNAL METHOD
    def _CCM_sql_query(self, start_date: datetime.date, end_date: datetime.date, vars: list, specific_query: bool, id_type: str, ids: list):
        sql_str = ''
        table = 'CCM'

        # convert date time object to strings for the SQL query
        start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
        end_date_str  = '\'' + end_date.strftime('%Y-%m-%d') + '\''

        # create argument string
        sql_str += f'SELECT {self._list_to_sql_str(vars, table)} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'

        # additional subsetting
        if(specific_query): sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'

        return(sql_str)
    
    
    # INTERNAL METHOD
    def _compustat_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, indfmt: list, datafmt: list, popsrc: list, consol: list, specific_query: bool, id_type: str, ids: list) -> str:
        """
        INTERNAL METHOD: Create SQL string used to query the WRDS Compustat database.

        Parameters
        ___________
        start_date: Starting date for the dataset queried.
        end_date: Ending date for the dataset queried.
        freq: The observational frequency of the query.
                Choices are:
                    * 'Q' : quarterly
                    * 'A' : annual

        Returns
        ________
        sql_str: String containing the SQL code used to query the specified Compustat database beased on
                    the start and end date and frequency given.
        """

        sql_str = ''
        table = f'COMPA_FUND{freq}'

        # convert date time object to strings for the SQL query
        start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
        end_date_str  = '\'' + end_date.strftime('%Y-%m-%d') + '\''

        # create argument string
        var_str = None
        if(vars is None):
            var_str = '*'
        else:
            var_str = self._list_to_sql_str(vars, table)
        sql_str += f'SELECT {var_str} FROM {table} WHERE datadate BETWEEN {start_date_str} AND {end_date_str}'

        # additional subsetting
        if(len(indfmt) != 0): sql_str += f' AND COMPA_FUND{freq}.indfmt IN ({self._list_to_sql_str(indfmt)})'
        if(len(datafmt) != 0): sql_str += f' AND COMPA_FUND{freq}.datafmt IN ({self._list_to_sql_str(datafmt)})'
        if(len(popsrc) != 0): sql_str += f' AND COMPA_FUND{freq}.popsrc IN ({self._list_to_sql_str(popsrc)})'
        if(len(consol) != 0): sql_str += f' AND COMPA_FUND{freq}.consol IN ({self._list_to_sql_str(consol)})'
        if(specific_query): sql_str += f' AND COMPA_FUND{freq}.{id_type} IN ({self._list_to_sql_str(ids)})'

        return(sql_str)

    # INTERNAL METHOD
    def _CRSP_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, exchcds: list, shrcds: list, specific_query: bool, id_type: str, ids: list) -> str:
        """
        INTERNAL METHOD: Create SQL string used to query the local WRDS CRSP monthly database.

        Parameters
        ___________
        start_date: Starting date for the dataset queried.

        end_date: Ending date for the dataset queried.

        freq: Observational frequency.
            Choices are:
                * 'D' : daily
                * 'M' : monthly

        Returns
        ________
        sql_str : str\n
            String containing the SQL code used to query the specified CRSP database beased on
                the observational frequency and WRDS update frequency of the CRSP database.

        Note
        _____
        Additonal to pulling the daily stock file (dsf) or the monthly stock file (msf) we also pull
            the daily or monthly stock events names file for the exchange code and the share code.
        """

        # table to query from
        sql_str = ''
        table = f'CRSP_{freq}'

        # convert date time object to strings for the SQL query
        start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
        end_date_str  = '\'' + end_date.strftime('%Y-%m-%d') + '\''

        # create argument string
        var_str = self._list_to_sql_str(vars, table)
        sql_str += f'SELECT {var_str} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'

        # additional subsetting
        if(len(exchcds) != 0): sql_str += f' AND exchcd in ({self._list_to_sql_str(exchcds)})'
        if(len(shrcds) != 0): sql_str += f' AND shrcd in ({self._list_to_sql_str(shrcds)})'
        if(specific_query): sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'

        return(sql_str)

    # INTERNAL METHOD
    def _rf1m_SQL_query(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> str:
        """
        INTERNAL METHOD: Create SQL string used to query the Fama-French risk free rate
                            listed on WRDS CRSP in the FF library. This rate is the
                            1 month T-Bill rate.

        Parameters
        ___________
        start_date: str\n
            Starting date for the data being queried.

        end_date: str\n
            Ending date for the data being queried.

        obs_freq: str\n
            The observational frequency of the CRSP delisting database being queried.
                Choices are:
                    * 'D' : daily
                    * 'M' : monthly
                    * 'A' : annual

        Returns
        ________
        sql_str : str\n
            String containing the SQL code used to query the risk free rate in the
                Fama-French (FF) library on CRSP/WRDS database.

        Note
        _____
        Depending on the observational frequency (obs_freq) given the compounding of the
            risk-free rate changes.
        """
        # convert date time object to strings for the SQL query
        start_date_str = start_date.strftime('%Y-%m-%d')
        end_date_str  = end_date.strftime('%Y-%m-%d')

        # Depending on the frequency supplied the compounding changes
        if(obs_freq == 'D'):
            sql_1 = 'strftime(\'%d\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%d\', date) AS diff'
            sql_2 = 'rf AS cumrf'
            library = 'FF_FACTORS_DAILY'
        elif(obs_freq == 'M'):
            sql_1 = 'strftime(\'%m\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%m\', date) AS diff'
            sql_2 = 'rf AS cumrf'
            library = 'FF_FACTORS_MONTHLY'
        elif(obs_freq == 'A'):
            sql_1 = 'strftime(\'%Y\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%Y\', date) AS diff'
            sql_2 = 'EXP(SUM(LN(1 + rf)) OVER (PARTITION BY strftime(\'%Y\', date))) - 1 AS cumrf'
            library = 'FF_FACTORS_MONTHLY'
        else:
            cprint.fatal('No valid observational frequency given.', interrupt = True)

        sql_dic = {'sql_1' : sql_1, 'sql_2' : sql_2, 'library' : library, 'start_date' : '\'' + start_date_str + '\'', 'end_date' : '\'' + end_date_str + '\''}
        sql_str = """
                    SELECT date, rf
                    FROM (
                        SELECT date, {0}, rf, {1}
                        FROM {2}
                        WHERE date BETWEEN {3} AND {4}
                    ) AS crsp_rf
                    WHERE diff != 0 OR diff IS NULL
                  """.format(sql_dic['sql_1'], sql_dic['sql_2'], sql_dic['library'], sql_dic['start_date'], sql_dic['end_date'])
        return(sql_str)

Methods

def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) ‑> pandas.core.frame.DataFrame

Used to query the merged CRSP/Compustat (CCM) table.

Parameters


start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * Q: quarterly * A: annual

Keyword Arguments


vars: list; The variables to query for. add_vars: list; Additional variables to query for on top of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for.

Note


The variables that can be queried for are: 'gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 'pstk', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'shrcd', 'exchcd', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev', 'pr2_12', 'pr1_1', 'pr13_60', 'prx2_12', 'prx1_1', 'prx13_60'

Note


If the frequency is quarterly the variables that can be queried for are the same as the annual file except for 'pstkrv' and 'pstkl'.

Note


The default variables that are queried for are: 'date', 'gvkey', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe',
'cfp', 'inv', 'op', 'PR2_12', 'PR1_1', 'PR13_60'

Expand source code
def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pd.DataFrame:
    """
    Used to query the merged CRSP/Compustat (CCM) table.

    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date:   The ending date of the data query.

    NOTE(review): a 'freq' parameter (Q: quarterly / A: annual) was documented here but
        this method does not accept one — confirm whether the local CCM table frequency
        is fixed.

    Keyword Arguments
    __________________
    vars:     list; The variables to query for.
    add_vars: list; Additional variables to query for on top of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to true to query for all variables in the table.
    id_type:  str;  Type of ID used to query for specific assets.
        Choices are:
            * ticker
            * gvkey
            * permno
            * cusip
            * permco
    ids:      list; The ids of type 'id_type' to query for.

    Raises
    _______
    Exception: if 'vars' and 'add_vars' are combined, if any requested variable is not
        in VALID_VARS, or if 'id_type'/'ids' are given inconsistently.

    Note
    _____
    The variables that can be queried for are:
        'gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 
        'pstk', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'shrcd', 'exchcd', 
        'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev', 'pr2_12', 'pr1_1', 'pr13_60', 'prx2_12', 'prx1_1', 'prx13_60'

    Note
    _____
    If the frequency is quarterly the variables that can be queried for are the same as the annual file except
        for 'pstkrv' and 'pstkl'.

    Note
    _____
    The default variables that are queried for are:
        'date', 'gvkey', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe',  
        'cfp', 'inv', 'op', 'PR2_12', 'PR1_1', 'PR13_60'
    """
    # vars that can be queried for
    VALID_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'dp', 'year', 'month', 'pr1_1', 'pr2_12', 'pr13_60', 'prx1_1', 'prx2_12', 'prx13_60', 
                  'me', 'ffdate', 'ffyear', 'ffmonth', 'months_in', 'wt', 'dec_me', 'dltt', 'mib', 'revt', 'csho', 'adjex_f', 'act', 'xint', 'pstk', 'txdi', 'gvkey', 'ib', 'xsga', 'dlc', 'ceq', 'che', 'datadate', 'txdc', 'dpc', 'ibc', 
                  'fyear', 'pstkl', 'teq', 'cogs', 'pstkrv', 'lct', 'dpre', 'txditc', 'seq', 'at', 'sale', 'year_end', 'years_in', 'ps', 'be', 'earn', 'profit', 'op', 'inv', 'cf', 'csho_adj', 'd_owcap_adj', 'ac', 'ni_csho_adj', 'nsi', 'ffbm', 
                  'bm', 'ffep', 'ep', 'ffcfp', 'cfp', 'beta']

    # earlier (annual-file) variable list kept for reference
    #VALID_VARS = ['gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 
    #                  'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 
    #                  'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev']

    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'gvkey', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'datadate', 'year_end', 'ffdate', 'prc', 'shrout', 'adjret', 'adjretx', 
                    'me', 'wt', 'dp', 'be', 'bm', 'ffbm', 'ep', 'ffep', 'cfp', 'ffcfp', 'inv', 'op', 'pr2_12', 'pr1_1', 'pr13_60', 'beta', 'ac', 'nsi', 'years_in', 'months_in', 'month', 'ffyear']

    # target dtype for each variable (applied after the raw SQL read)
    VARS_DATA_TYPE = {'permno': str, 'permco': str, 'ticker': str, 'shrcd': str, 'exchcd': str, 'prc': float, 'shrout': float, 'adjret': float, 'adjretx': float, 'adjcumret': float, 
                      'adjcumretx': float, 'dp': float, 'year': int, 'month': int, 'pr1_1': float, 'pr2_12': float, 'pr13_60': float, 'prx1_1': float, 'prx2_12': float, 'prx13_60': float, 
                      'me': float, 'ffyear': int, 'ffmonth': int, 'months_in': int, 'wt': float, 'dec_me': float, 'dltt': float, 'mib': float, 'revt': float, 'csho': float, 'adjex_f': float, 
                      'act': float, 'xint': float, 'pstk': float, 'txdi': float, 'gvkey': str, 'ib': float, 'xsga': float, 'dlc': float, 'ceq': float, 'che': float, 'txdc': float, 'dpc': float, 'ibc': float, 
                      'fyear': int, 'pstkl': float, 'teq': float, 'cogs': float, 'pstkrv': float, 'lct': float, 'dpre': float, 'txditc': float, 'seq': float, 'at': float, 'sale': float, 'years_in': int, 
                      'ps': float, 'be': float, 'earn': float, 'profit': float, 'op': float, 'inv': float, 'cf': float, 'csho_adj': float, 'd_owcap_adj': float, 'ac': float, 'ni_csho_adj': float, 'nsi': float, 'ffbm': float, 
                      'bm': float, 'ffep': float, 'ep': float, 'ffcfp': float, 'cfp': float, 'beta': float}

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################

    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

    # create list of the variables being queried
    query_vars = None
    if('vars' in kwargs):
        # caller gave an explicit variable list
        query_vars = kwargs['vars']

        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars): query_vars.insert(0, 'permno')

        # add date if people forgot
        if('date' not in query_vars): query_vars.insert(0, 'date')
    else:
        # start from the defaults, optionally extended/trimmed via add_vars/sub_vars
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + kwargs['add_vars']
        else:
            query_vars = DEFAULT_VARS
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]

    # 'all_vars' overrides any other variable selection
    if('all_vars' in kwargs): query_vars = VALID_VARS

    # make sure that all vars are valid to be queried
    all_valid = all(elem in VALID_VARS for elem in query_vars)
    if(not all_valid):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from the combined CRSP/Compustat merged table. The CCM table does not contain all of the variables that are in CRSP and Compustat.')

    # 'id_type' and 'ids' must be supplied together to restrict to specific assets
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################

    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CCM_sql_query(start_date, end_date, vars = query_vars, 
                                             specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)

    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################

    # coerce 'fyear' to float before the dtype map is applied below
    # NOTE(review): 'query_vars' is never None at this point (it is always assigned
    # above), so the first branch looks dead — confirm before removing
    if(query_vars is None): 
        raw_df.fyear = raw_df.fyear.astype(float)
    if(not query_vars is None):
        if('fyear' in query_vars): 
            raw_df.fyear = raw_df.fyear.astype(float)

    # collect the non-date columns present and look up their target dtypes
    quried_vars = list(set(list(raw_df.columns)) - set(['date', 'datadate', 'ffdate', 'year_end']))
    vars_dtypes = {}
    for var in quried_vars:
        vars_dtypes[var] = VARS_DATA_TYPE[var]

    # convert date columns to datetimes, then apply the dtype map to the rest
    raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
    raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
    raw_df.ffdate = pd.to_datetime(raw_df.ffdate, format = '%Y-%m-%d')
    raw_df.year_end = pd.to_datetime(raw_df.year_end, format = '%Y-%m-%d')
    raw_df = raw_df.astype(vars_dtypes)

    # replace any python 'None' objects with np.nan
    raw_df = raw_df.fillna(value = np.nan)

    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[query_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)

    # return dataframe
    return(raw_df)
def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) ‑> pandas.core.frame.DataFrame

Used to query the raw CRSP files. Additionally, variables can be created and stock splits can be adjusted for.

Parameters


start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * M: monthly * D: daily adj_stocksplit: default = True; Whether or not to adjust for a stock split event.

Keyword Arguments


vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for. exchcds: list; The exchange codes to use for querying (default: NYSE, NYSE Market (formally AMEX), and NASDAQ) shrcds: list; The share codes to use for querying (default: US Common Stock)

Note


If the frequency is monthly then the variables that can be quiried for are: 'date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt'

Note


If the frequency is daily then the variables that can be quiried for are: 'date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd'

Note


The variables that are created from CRSP primitives are: * 'me': Market Equity (millions) * 'adjret': Returns adjusted for delisting events * 'adjretx': Returns adjusted for delisting events ex. dividend * 'dvd': Dividend (uses 'adjret' and 'adjretx' to calculate) * 'dp': Dividend-to-Price Ratio

Note


This function defaults to querying for all companies that are consistent with Fama's definitions. That is to say assets with a share code of 10 or 11 and an exchange code of 1, 2, or 3.

Expand source code
def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) -> pd.DataFrame:
    """
    Used to query the raw CRSP files. Additionally, variables can be created and stock splits can be adjusted for.

    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date:   The ending date of the data query.
    freq:       The frequency of the data query.
        Choices are:
            * M: monthly
            * D: daily
    adj_stocksplit: default = True; Whether or not to adjust for a stock split event.

    Keyword Arguments
    __________________
    vars:     list; The variables to query for.
    add_vars: list; Additional variables to query for on top of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to True to query for all variables in the table.
    id_type:  str;  Type of ID used to query for specific assets.
        Choices are:
                * ticker
                * gvkey
                * permno
                * cusip
                * permco
    ids:      list; The ids of type 'id_type' to query for.
    exchcds:  list; The exchange codes to use for querying (default: NYSE, NYSE Market (formerly AMEX), and NASDAQ)
    shrcds:   list; The share codes to use for querying (default: US Common Stock)

    Note
    _____
    If the frequency is monthly then the variables that can be queried for are:
        'date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 
        'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 
        'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 
        'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt'

    Note
    _____
    If the frequency is daily then the variables that can be queried for are:
        'date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
        'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 
        'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 
        'nwperm', 'dlamt', 'openprc', 'numtrd'

    Note
    _____
    The variables that are created from CRSP primitives are:
        * 'me':         Market Equity (millions)
        * 'adjret':     Returns adjusted for delisting events
        * 'adjretx':    Returns adjusted for delisting events ex. dividend
        * 'dvd':        Dividend (uses 'adjret' and 'adjretx' to calculate)
        * 'dp':         Dividend-to-Price Ratio
        * 'cumret':     Cumulative gross return ('ret' compounded within each permno)
        * 'cumretx':    Cumulative gross return ex. dividend ('retx' compounded within each permno)
        * 'adjcumret':  Cumulative gross return adjusted for delisting events
        * 'adjcumretx': Cumulative gross return adjusted for delisting events ex. dividend

    Note
    _____
    This function defaults to querying for all companies that are consistent with Fama's definitions. That is to say assets with a share code of 10 or 11 
        and an exchange code of 1, 2, or 3.
    """
    # variables that can be queried for; the daily file lacks some monthly-only fields and adds 'openprc'/'numtrd'
    STD_VARS = None
    if(freq == 'M'):
        STD_VARS = ['date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
                    'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 
                    'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 
                    'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt']
    else:
        STD_VARS = ['date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 
                    'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 
                    'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 
                    'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd']

    # variables derived here from CRSP primitives rather than stored in the table
    CREATE_VARS = ['me', 'adjret', 'adjretx', 'dvd', 'dp', 'cumret', 'cumretx', 'adjcumret', 'adjcumretx']

    VALID_VARS = STD_VARS + CREATE_VARS

    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'me', 'dp', 'vol']

    # variable data types
    VARS_DATA_TYPE = {'cusip': str, 'permno': str, 'permco' : str, 'comnam': str, 'compno': str, 'ticker': str, 
                      'primexch': str, 'tsymbol': str, 'secstat': str, 'hsiccd': str, 'naics': str, 'siccd': str, 'trdstat': str, 'ncusip': str,
                      'shrcd': str, 'exchcd': str, 'issuno': str, 'hexcd': str, 'shrcls': str, 
                      'ret': float, 'retx': float, 'shrout': float, 'prc': float, 'cfacshr': float, 'cfacpr': float, 
                      'bidlo': float, 'bid': float, 'ask': float, 'askhi': float, 'spread': float, 'altprc': float, 'vol': float,
                      'dlstdt': str, 'dlstcd': str, 'nwperm': str, 'nwcomp': str, 'nextdt': str, 'dlamt': float, 'dlretx': float, 'dlprc': float, 
                      'dlpdt': str, 'dlret': float, 'acperm': str, 'accomp': str, 'me': float, 'adjret': float, 'adjretx': float, 'dvd': float, 
                      'adjdvd': float, 'dp': float, 'openprc': float, 'numtrd': float, 'cumret': float, 'cumretx': float, 'adjcumret': float, 'adjcumretx': float}

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################

    if(freq not in ['D', 'M']): raise Exception('Invalid frequency given to query_CRSP')

    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keyword Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

    # create list of the variables being queried (copied so the caller's list is never mutated)
    query_vars = None
    if('vars' in kwargs):
        query_vars = list(kwargs['vars'])

        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars): query_vars.insert(0, 'permno')

        # add date if people forgot
        if('date' not in query_vars): query_vars.insert(0, 'date')
    else:
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + kwargs['add_vars']
        else:
            query_vars = DEFAULT_VARS.copy()
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]

    # BUG FIX: check the value of 'all_vars' (previously 'all_vars = False' behaved like True because only presence was tested)
    if(kwargs.get('all_vars', False)): query_vars = VALID_VARS.copy()

    # used for dataframe formatting at the end
    og_vars = query_vars.copy()

    # make sure that all vars are valid to be queried
    all_valid = all(elem in VALID_VARS for elem in query_vars)
    if(not all_valid):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from CRSP.')

    # always adjust for stock splits (can disable this); requires the price/share cumulative adjustment factors
    if(adj_stocksplit):
        if('prc' not in query_vars): query_vars.append('prc')
        if('cfacpr' not in query_vars): query_vars.append('cfacpr')
        if('shrout' not in query_vars): query_vars.append('shrout')
        if('cfacshr' not in query_vars): query_vars.append('cfacshr')

    # if created variables are being queried for then add the primitives needed to create them
    if('me' in query_vars):
        if('prc' not in query_vars): query_vars.append('prc')
        if('shrout' not in query_vars): query_vars.append('shrout')

    if('dp' in query_vars):
        if('dvd' not in query_vars): query_vars.append('dvd')

    if('dvd' in query_vars):
        if('adjret' not in query_vars): query_vars.append('adjret')
        if('adjretx' not in query_vars): query_vars.append('adjretx')
        if('prc' not in query_vars): query_vars.append('prc')
        # BUG FIX: 'permco' is required by the groupby used to lag prices when building 'dvd'
        if('permco' not in query_vars): query_vars.append('permco')

    if('adjret' in query_vars):
        if('ret' not in query_vars): query_vars.append('ret')
        if('dlret' not in query_vars): query_vars.append('dlret')

    if('adjretx' in query_vars):
        if('retx' not in query_vars): query_vars.append('retx')
        if('dlretx' not in query_vars): query_vars.append('dlretx')

    if('cumret' in query_vars):
        if('ret' not in query_vars): query_vars.append('ret')

    if('cumretx' in query_vars):
        if('retx' not in query_vars): query_vars.append('retx')

    if('adjcumret' in query_vars):
        # BUG FIX: the delisting-adjusted cumulative return compounds 'adjret', so 'adjret' must be created too
        if('adjret' not in query_vars): query_vars.append('adjret')
        if('ret' not in query_vars): query_vars.append('ret')
        if('dlret' not in query_vars): query_vars.append('dlret')

    if('adjcumretx' in query_vars):
        # BUG FIX: likewise 'adjcumretx' compounds 'adjretx'
        if('adjretx' not in query_vars): query_vars.append('adjretx')
        if('retx' not in query_vars): query_vars.append('retx')
        if('dlretx' not in query_vars): query_vars.append('dlretx')

    exchcds = kwargs['exchcds'] if('exchcds' in kwargs) else [1, 2, 3] # default: NYSE, NYSE MKT, NASDAQ
    shrcds = kwargs['shrcds'] if('shrcds' in kwargs) else [10, 11]     # default: US-based common stock

    # querying for specific assets requires both the id type and the ids themselves
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyword argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']

    # created vars are not in the table so remove them
    db_vars = [var for var in query_vars if var not in CREATE_VARS]

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################

    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CRSP_SQL_query(start_date, end_date, freq, vars = db_vars, exchcds = exchcds, shrcds = shrcds, 
                                              specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)

    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################

    # integer-coded identifier columns can come back as floats; use nullable Int64 so missing codes survive the cast
    DOWNCAST_VARS = ['permno', 'permco', 'exchcd', 'issuno', 'hexcd', 'shrcd', 'compno', 'hsiccd', 'naics', 'siccd', 'acperm', 'accomp', 'dlstcd', 'nwcomp', 'nwperm']
    for var in DOWNCAST_VARS:
        if(var in query_vars):
            raw_df[var] = raw_df[var].astype('Int64')

    # get non-date vars in the dataframe (date columns are converted separately below)
    quried_vars = list(set(list(raw_df.columns)) - set(['altprcdt', 'date', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']))
    vars_dtypes = {}
    for var in quried_vars:
        vars_dtypes[var] = VARS_DATA_TYPE[var]

    # convert dates to datetimes and, for monthly data, align each observation to the end of its month
    raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
    if(freq == 'M'): raw_df.date += MonthEnd(0)

    if('altprcdt' in query_vars): raw_df.altprcdt = pd.to_datetime(raw_df.altprcdt, format = '%Y-%m-%d')
    if('nameendt' in query_vars): raw_df.nameendt = pd.to_datetime(raw_df.nameendt, format = '%Y-%m-%d')
    if('namedt' in query_vars): raw_df.namedt = pd.to_datetime(raw_df.namedt, format = '%Y-%m-%d')
    if('dlstdt' in query_vars): raw_df.dlstdt = pd.to_datetime(raw_df.dlstdt, format = '%Y-%m-%d')
    if('nextdt' in query_vars): raw_df.nextdt = pd.to_datetime(raw_df.nextdt, format = '%Y-%m-%d')
    if('dlpdt' in query_vars): raw_df.dlpdt = pd.to_datetime(raw_df.dlpdt, format = '%Y-%m-%d')

    # make sure that the data is the correct type
    raw_df = raw_df.astype(vars_dtypes)

    # replace any python 'None' objects with np.nan
    raw_df = raw_df.fillna(value = np.nan)

    # adjust for stock splits using CRSP's cumulative adjustment factors
    if(adj_stocksplit):
        raw_df.prc /= raw_df.cfacpr
        raw_df.shrout *= raw_df.cfacshr

    # Market Equity. Market equity (size) is price times shares outstanding. Price and shares outstanding from CRSP.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    if('me' in query_vars):
        raw_df['me'] = raw_df.prc.abs() * raw_df.shrout
        # convert market equity to $millions
        raw_df.me /= 1e3

    # adjust returns for delisting events: compound the ordinary return with the delisting return
    if('adjret' in query_vars):
        raw_df.dlret = raw_df.dlret.fillna(value = 0.0)
        raw_df['adjret'] = ((1 + raw_df.ret) * (1 + raw_df.dlret)) - 1
    if('adjretx' in query_vars):
        raw_df.dlretx = raw_df.dlretx.fillna(value = 0.0)
        raw_df['adjretx'] = ((1 + raw_df.retx) * (1 + raw_df.dlretx)) - 1

    # create dividends paid implied by the gap between cum- and ex-dividend returns, scaled by the lagged price
    # BUG FIX: only build 'dvd' when it was actually requested; previously it was built whenever 'adjret'
    # and 'adjretx' were present, which crashed if 'prc'/'permco' had not been queried
    # NOTE(review): the price lag is grouped by 'permco' — confirm this should not be 'permno'
    if('dvd' in query_vars):
        raw_df['dvd'] = (raw_df.adjret - raw_df.adjretx) * raw_df.groupby(['permco'])['prc'].shift(1).abs()

    # create cumulative returns by compounding the per-period gross returns within each permno
    if('cumret' in query_vars):
        raw_df['1+ret'] = 1 + raw_df.ret
        raw_df['cumret'] = raw_df.groupby(by = ['permno'])['1+ret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+ret'])

    # BUG FIX: previously compounded 'ret' instead of 'retx'
    if('cumretx' in query_vars):
        raw_df['1+retx'] = 1 + raw_df.retx
        raw_df['cumretx'] = raw_df.groupby(by = ['permno'])['1+retx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+retx'])

    # BUG FIX: previously compounded the unadjusted 'ret' instead of 'adjret'
    if('adjcumret' in query_vars):
        raw_df['1+adjret'] = 1 + raw_df.adjret
        raw_df['adjcumret'] = raw_df.groupby(by = ['permno'])['1+adjret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjret'])

    # BUG FIX: previously gated on 'adjcumret' (copy/paste) and compounded 'ret' instead of 'adjretx'
    if('adjcumretx' in query_vars):
        raw_df['1+adjretx'] = 1 + raw_df.adjretx
        raw_df['adjcumretx'] = raw_df.groupby(by = ['permno'])['1+adjretx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjretx'])

    # Dividend Yield. The dividend yield used to form portfolios in June of year t is the total dividends paid from July of t-1 
            # to June of t per dollar of equity in June of t. The dividend yield is computed using the with and without dividend returns 
            # from CRSP, as described in Fama and French, 1988, "Dividend yields and expected stock returns," Journal of Financial Economics 25.
            # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    # NOTE: Following Fama French the dividend price ratio uses the last year of dividends paid out if possible with a minimum 7 months.
    if('dp' in query_vars):
        if(freq == 'M'):
            min_periods = 7
            window = 12
        else:
            min_periods = 147 # 252 days / 12 months * 7 months
            window = 252
        raw_df['cumdvd'] = raw_df.groupby(['permno'])['dvd'].rolling(min_periods = min_periods, window = window).sum().reset_index(level = 'permno')[['dvd']]
        raw_df['dp'] = raw_df.cumdvd / raw_df.prc.abs()
        raw_df.dp = np.where((raw_df.dp.isnull()) | (raw_df.dp < 0), np.nan, raw_df.dp)
        raw_df = raw_df.drop(columns = ['cumdvd'])

    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[og_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)

    # return the raw dataframe
    return(raw_df)
def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) ‑> pandas.core.frame.DataFrame

Used to query the raw Compustat tables.

Parameters


start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * Q: quarterly * A: annual

Keyword Arguments


vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for.

Note


The default variables that are queried for from the quarterly file have their names changed to mirror those in the annual file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make it easier to compute the anomaly characteristics when creating the combined CCM tables.

Note


By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks. To do this you need to know the actual name of the varibale that you want to query for, paying attention to Compustat's naming conventions between their annual and quarterly files.

Note


The defualt variables that are queried for are if the frequency given is annual: 'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib'

If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'.

Note


There is less error checking in this function compared to the other methods in this class because of the large number of variables in Compustat.

Expand source code
def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) -> pd.DataFrame:
    """
    Used to query the raw Compustat tables.

    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date:   The ending date of the data query.
    freq:       The frequency of the data query.
        Choices are:
            * Q: quarterly
            * A: annual

    Keyword Arguments
    __________________
    vars:     list; The variables to query for.
    add_vars: list; Additional variables to query for on top of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to True to query for all variables in the table.
    id_type:  str;  Type of ID used to query for specific assets.
        Choices are:
            * ticker
            * gvkey
            * permno
            * cusip
            * permco
    ids:      list; The ids of type 'id_type' to query for.

    Note
    _____
    The default variables that are queried for from the quarterly file have their names changed to mirror those in the annual
        file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the 
        fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to 
        the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make 
        it easier to compute the anomaly characteristics when creating the combined CCM tables.  

    Note
    _____
    By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks.
        To do this you need to know the actual name of the variable that you want to query for, paying attention to Compustat's
        naming conventions between their annual and quarterly files.

    Note
    _____
    The default variables that are queried for, if the frequency given is annual, are:
        'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq', 
        'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib'

        If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'.

    Note
    _____
    There is less error checking in this function compared to the other methods in this class because of the large number of variables
        in Compustat.
    """
    # default columns pulled from the annual (FUNDA) vs quarterly (FUNDQ) tables
    STD_VARS = None
    if(freq == 'A'):
        STD_VARS = ['gvkey', 'datadate', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdc', 'dpc', 'che', 'dlc', 'ceq', 'seq', 'teq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'ibc', 'dltt', 'mib', 'ib', 'dp']
    else:
        STD_VARS = ['gvkey', 'datadate', 'tic', 'atq', 'saleq', 'cogsq', 'actq', 'txdiq', 'cshoq', 'lctq', 'txdcy', 'dpcy', 'cheq', 'dlcq', 'ceqq', 'seqq', 'teqq', 'pstkq', 'txditcq', 'xintq', 'xsgaq', 'ibcy', 'dlttq', 'mibq', 'ibq', 'dpq']

    # dtypes applied to any default column present in the result (non-default columns keep their inferred dtype)
    DEFAULT_DTYPES = {'gvkey': str, 'ticker': str, 'at': float, 'sale': float, 'cogs': float, 'act': float, 'txdi': float, 'csho': float, 'lct': float, 'dltt': float, 'mib': float,
                      'txdc': float, 'dpre': float, 'che': float, 'dlc': float, 'ceq': float, 'seq': float, 'teq': float, 'pstk': float, 'txditc': float, 'xint': float, 'xsga': float, 'ibc': float, 'ib': float}

    # variables derived here rather than stored in Compustat
    CREATED_VARS = ['years_in']

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################

    if(freq not in ['Q', 'A']): raise Exception('Invalid frequency given to query_Compustat')

    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keyword Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

    # create list of the variables being queried (copied so the caller's list is never mutated)
    query_vars = None
    if('vars' in kwargs):
        query_vars = list(kwargs['vars'])

        # 'gvkey' needs to be in the query vars for merging
        if('gvkey' not in query_vars): query_vars.insert(0, 'gvkey')

        # add date if people forgot
        if('datadate' not in query_vars and 'date' not in query_vars): query_vars.insert(0, 'datadate')
    else:
        if('add_vars' in kwargs):
            query_vars = STD_VARS + kwargs['add_vars']
        else:
            query_vars = STD_VARS.copy()
        if('sub_vars' in kwargs):
            sub_vars = ['tic' if elem == 'ticker' else elem for elem in kwargs['sub_vars']]
            query_vars = [elem for elem in query_vars if elem not in sub_vars]

    # translate user friendly aliases into Compustat's actual column names
    query_vars = ['datadate' if elem == 'date' else elem for elem in query_vars]
    query_vars = ['tic' if elem == 'ticker' else elem for elem in query_vars]
    query_vars = ['conm' if elem == 'comnam' else elem for elem in query_vars]

    # None is the sentinel for 'select every column'
    # BUG FIX: check the value of 'all_vars' (previously 'all_vars = False' behaved like True because only presence was tested)
    if(kwargs.get('all_vars', False)): query_vars = None

    indfmts = kwargs['indfmts'] if('indfmts' in kwargs) else ['INDL']           # default: Industrial
    datafmts = kwargs['datafmts'] if('datafmts' in kwargs) else ['STD']         # default: Standardized
    popsrcs = kwargs['popsrcs'] if('popsrcs' in kwargs) else ['D']              # default: Domestic
    consols = kwargs['consols'] if('consols' in kwargs) else ['C']              # default: Consolidated

    # querying for specific assets requires both the id type and the ids themselves
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyword argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        if(id_type == 'ticker'): id_type = 'tic'
        ids = kwargs['ids']

    # created vars are not in the table so remove them before hitting the database
    # BUG FIX: guard the all-variables case (query_vars is None used to crash in set()); a list
    # comprehension also preserves the caller's column order where set() scrambled it
    query_vars_DB = None if(query_vars is None) else [var for var in query_vars if var not in CREATED_VARS]

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################

    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._compustat_SQL_query(start_date, end_date, freq, vars = query_vars_DB, 
                                                   indfmt = indfmts, datafmt = datafmts, popsrc = popsrcs, consol = consols,
                                                   specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)


    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################

    # rename columns to the user friendly aliases
    raw_df = raw_df.rename(columns = {'tic': 'ticker', 'conm':'comnam'})

    # rename the default columns to match the names from the COMPA_FUNDA (annual) table
    if(freq == 'Q'):
        # quarterly compustat uses 'q'/'y' suffixed names for the same concepts
        raw_df = raw_df.rename(columns = {'atq':'at', 'seqq':'seq', 'ceqq':'ceq', 'teqq':'teq',
                                          'pstkq':'pstk', 'txdcy':'txdc', 'txditcq':'txditc', 'saleq':'sale',
                                          'cogsq':'cogs', 'xintq':'xint', 'xsgaq':'xsga', 'mibq':'mib', 
                                          'ibcy':'ibc', 'txdiq':'txdi', 'dpq':'dpre', 'cshoq':'csho', 'adjex':'adjex_f',
                                          'actq':'act', 'lctq':'lct', 'cheq':'che', 'dlcq':'dlc', 'dlttq': 'dltt', 'ibq': 'ib'})
    else:
        # annual compustat: rename depreciation so it cannot clash with the dividend-price ratio 'dp'
        raw_df = raw_df.rename(columns = {'dp': 'dpre'})

    # make date a datetime and align to the end of the year/quarter
    raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
    if(freq == 'A'):
        raw_df['year_end'] = raw_df.datadate + YearEnd(0)
    else:
        raw_df['quarter_end'] = raw_df.datadate + QuarterEnd(0)

    # fiscal year can come back as a non-numeric column; force it to float
    # BUG FIX: test the dataframe's columns so the all-variables case (query_vars is None) does not crash
    if('fyear' in raw_df.columns): 
        raw_df.fyear = raw_df.fyear.astype(float)

    # add the number of periods a company has been in Compustat (0 for its first observation)
    if(query_vars is not None and 'years_in' in query_vars):
        raw_df['years_in'] = raw_df.groupby(by = ['gvkey']).cumcount()
            
    # get vars in the dataframe and the dtypes we know for them
    quried_vars = list(set(list(raw_df.columns)) - set(['date']))
    vars_dtypes = {}
    for var in quried_vars:
        if(var in DEFAULT_DTYPES):
            vars_dtypes[var] = DEFAULT_DTYPES[var]

    # replace python 'None' objects with np.nan and convert dtypes
    raw_df = raw_df.fillna(value = np.nan)
    raw_df = raw_df.astype(vars_dtypes)

    # sort just for ease of reading
    raw_df = raw_df.drop_duplicates()
    sorting_dims = ['gvkey', 'year_end'] if(freq == 'A') else ['gvkey', 'quarter_end']
    raw_df = raw_df.sort_values(by = sorting_dims)
    raw_df = raw_df.reset_index(drop = True)

    # return the dataframe
    return(raw_df)

Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities to Compustat companies on permno and gvkey.

Returns


raw_df: pd.DataFrame

The raw dataframe pulled from local WRDS database.

Note


Currently this function only works if a local copy of the WRDS database exists w/ the CCM Linktable.

Expand source code
def query_link_table(self) -> pd.DataFrame:
    """
    Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities to
        Compustat companies on permno and gvkey.

    Returns
    ________
    link_df: pd.DataFrame\n
        The linking table pulled from the local WRDS database.

    Note
    _____
    Currently this function only works if a local copy of the WRDS database exists w/ the CCM Linktable.
    """
    sql_str = """
                SELECT gvkey, lpermno as permno, lpermco as permco, linktype, linkprim, linkdt, linkenddt
                FROM CRSP_CCMXPF_LINKTABLE
                WHERE substr(linktype, 1, 1) = 'L'
                AND (linkprim = 'C' or linkprim = 'P')
              """

    # pull the usable ('L'-type, primary) links from the local database
    link_df = pd.read_sql(sql_str, con = self.sql_engine)

    # downcast the CRSP identifiers to integers first so the string form carries no
    # trailing decimals (e.g. '10001' rather than '10001.0'), then stringify all ids
    for id_col in ['permno', 'permco']:
        link_df[id_col] = pd.to_numeric(link_df[id_col], downcast = 'integer')
    for id_col in ['gvkey', 'permno', 'permco']:
        link_df[id_col] = link_df[id_col].astype(str)

    # a link with no end date is still active; treat it as valid through today
    link_df.linkenddt = link_df.linkenddt.fillna(pd.to_datetime('today').date())

    # express both link boundaries as pandas timestamps
    for dt_col in ['linkdt', 'linkenddt']:
        link_df[dt_col] = link_df[dt_col].astype('datetime64[ns]')

    # hand back the cleaned linking table
    return(link_df)
def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) ‑> pandas.core.frame.DataFrame

Query the risk-free rate from the Fama-French library on local WRDS. This rate is equivalent to the 1 month T-Bill rate.

Parameters


start_date: datetime.date

Starting date of the dataset being queried.

end_date: datetime.date

Ending date of the dataset being queried.

obs_freq: str

The observational frequency of the CRSP database being queried.
    Choices are:
        * 'D' : daily
        * 'M' : monthly
        * 'A' : annually

Returns


full_df: pd.DataFrame

Risk-free rate data.

Note


The dataframe returned makes adjustments for NYSE holidays during compounding.

Note


List of queried CRSP variables:

* date : Date of observation
* rf   : Risk-free rate
Expand source code
def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> pd.DataFrame:
    """
    Query the risk-free rate from the Fama-French library on local WRDS. This rate is equivalent to the 1 month T-Bill rate.

    Parameters
    ___________
    start_date: datetime.date\n
        Starting date of the dataset being queried.

    end_date: datetime.date\n
        Ending date of the dataset being queried.

    obs_freq: str\n
        The observational frequency of the CRSP database being queried.
            Choices are:
                * 'D' : daily
                * 'M' : monthly
                * 'A' : annually

    Returns
    ________
    full_df: pd.DataFrame\n
        Risk-free rate data.

    Note
    _____
    The dataframe returned makes adjustments for NYSE holidays during compounding.

    Note
    _____
    List of queried CRSP variables:\n
        * date : Date of observation
        * rf   : Risk-free rate
    """
    # Monthly (and annual) observations are dated on the 1st of each month, so any
    # 'start_date' that falls mid-month is snapped back to that month's first day
    # to make sure the query captures the observation of interest.
    if(obs_freq in ['M', 'A']):
        month_begin = (start_date + MonthBegin(-1)).date()
        if(start_date != month_begin):
            start_date = month_begin

    # query the local database for the risk-free series
    rf_df = pd.read_sql(self._rf1m_SQL_query(start_date, end_date, obs_freq), con = self.sql_engine)

    # parse the date column into pandas datetimes
    rf_df['date'] = pd.to_datetime(rf_df['date'])

    # shift observation dates to period end when the frequency is not daily
    period_end_offset = {'M': MonthEnd(0), 'A': YearEnd(0)}
    if(obs_freq in period_end_offset):
        rf_df['date'] = rf_df['date'] + period_end_offset[obs_freq]

    # return the risk-free dataframe
    return(rf_df)
def raw_sql(self, sql_str)

Allows the user to use raw SQL on the underlying database.

Note


This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database.

Expand source code
def raw_sql(self, sql_str):
    """
    Allows the user to use raw SQL on the underlying database.

    Note
    _____
    This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database.
    """
    # ask for explicit confirmation before touching the database directly
    cprint.warn('The operation that you are about to perform might damage the local database. Do you wish to continue [y/n]:')
    answer = input()

    # anything other than an exact 'y' aborts the operation
    if(answer != 'y'):
        cprint.info('Operation cancelled.')
        return(None)

    # user confirmed: execute the statement against the local database
    result_df = pd.read_sql(sql_str, con = self.sql_engine)
    return(result_df)