Module FamaFrench

Expand source code
import numpy as np
import pandas as pd
import QueryWRDS
import datetime
import pathlib
from shutil import get_terminal_size
import py_functions
import itertools
from tqdm import tqdm
import functools
from pandas.tseries.offsets import *
from cprint import *

# Module-level pandas display configuration (applies process-wide on import):
# show every row and column, size the output to the current terminal width,
# and print floats with five decimal places.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', get_terminal_size()[0])
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Silence pandas' chained-assignment warning for the whole process.
pd.options.mode.chained_assignment = None

class FamaFrench:

    # TODO (1) Error Checking
    #      (2) Industry Sorts

    def __init__(self, wrds_username: str, db_path: pathlib.Path):
        """Attach a WRDS query handle to the instance as `self.DB`.

        Args:
            wrds_username: passed to `QueryWRDS.QueryWRDS` as its first argument
            db_path: passed to `QueryWRDS.QueryWRDS` as `local_db_path`
        """
        wrds_handle = QueryWRDS.QueryWRDS(wrds_username, local_db_path = db_path)
        self.DB = wrds_handle

    # Helper Function
    def _safe_append(self, obj, chk, ins):
        if(not chk in obj):
            obj[chk] = [ins]
        else:
            obj[chk].append(ins)
        return(obj)

    # Helper Function
    def _portfolio_return(self, df, ret_type, weight_type, name):
        if(weight_type == 'vw'):
                mkt_s = df.groupby('date').apply(py_functions.wavg, ret_type, 'me')
        else:
            mkt_s = df.groupby('date').mean(numeric_only = True)[ret_type]
        mkt_s.name = name
        mkt_s = mkt_s.to_frame().reset_index()
        return(mkt_s)

    '''
    Valid Factors: MKT, RF, MKT_RF, SMB3, SMB5, HML, RMW, CMA, MOM, ST_REV, LT_REV 
    '''
    # TODO: Check for none dates
    def FF_factors(self, factors: list[str], 
                   dfin = None, 
                   start_date: datetime.datetime = None, 
                   end_date: datetime.datetime = None, 
                   weight_type: str = 'vw', 
                   ret_type: str = 'adjret', 
                   drop_na: bool = True
        ) -> pd.DataFrame:
        """Creates standard Fama-French factors
        
        Creates the Fama-French factors using the original accounting practices from
        Eugene Fama's and Kenneth French's original 1992 paper. 
        The Cross-Section of Expected Stock Returns https://doi.org/10.1111/j.1540-6261.1992.tb04398.x
        
        Constructable factors include: 'MKT' market return, 'RF' risk free rate, 'MKT_RF' equity premium, 
        'SMB3' 3 factor small minus big, 'SMB5' 5 factor small minus big, 'HML' high minus low, 
        'RMW' robust minus weak, 'CMA' conservative minus aggresive, 'MOM' momentum, 
        'ST_REV' short term reversal, 'LT_Rev' long term reversal. See https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
        for constrution notes.
        
        Args:
            factors: list of factors
            dfin: datadrame with assets to use (optional)
            start_date: start date for factors (optional)
            end_date: end date for factors (optional)
            weight_type: weights used to calculate returns
            ret_type: return type with or without dividends
            drop_na: if true drop rows that have NaN values

        Returns: 
            A dataframe with the specifed factros and a date column. Dataframe sorted by date.

        Example:
            Create the original 3 factor Fama-French model between 'date1' and 'date2'

            df = FamaFrench.FF_factors(
                factors = ['MKT_RF', 'SMB3', 'HML'], 
                start_date = date1, 
                end_date = date2
            )

        TODO:
            Error checking
        """
        
        # query DB if no dataframe supplied
        if(dfin is None):
            ccm_df = self.DB.query_CCM(start_date, end_date)
        else:
            ccm_df = dfin 
            
        # create resulting dataframe
        res = pd.DataFrame()
        date_s = ccm_df.date.unique()
        res['date'] = date_s
        res = res.sort_values(by = ['date'])

        # extract state and end date from dataframe
        if(not dfin is None):
            start_date = np.min(res.date)
            end_date = np.max(res.date)

        # calculate the market return of supplied assets
        if('MKT' in factors):
            mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
            res = res.merge(mkt_df, how = 'left', on = ['date'])

        # add the risk free rate
        if('RF' in factors):
            rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
            rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
            res = res.merge(rf_df, on = ['date'], how = 'left')

        # add the market premium
        if('MKT_RF' in factors):
            if('MKT' in factors and 'RF' in factors):
                res['MKT_RF'] = res.MKT - res.RF
            elif('MKT' in factors and not 'RF' in factors):
                rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
                rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
                res = res.merge(rf_df, on = ['date'], how = 'left')
                res['MKT_RF'] = res.MKT - res.RF
                res = res.drop(columns = ['MKT', 'RF'])
            elif(not 'MKT' in factors and 'RF' in factors):
                mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
                res = res.merge(mkt_df, how = 'left', on = ['date'])
                res['MKT_RF'] = res.MKT - res.RF
                res = res.drop(columns = ['MKT', 'RF'])
            else:
                mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
                res = res.merge(mkt_df, how = 'left', on = ['date'])
                rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
                rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
                res = res.merge(rf_df, on = ['date'], how = 'left')
                res['MKT_RF'] = res.MKT - res.RF
                res = res.drop(columns = ['MKT', 'RF'])

        # SMB factor from the 3-factor Fama-French model
        if('SMB3' in factors):
            # portfolio sorts on ME and BM
            sorts_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'ffbm': [0.3, 0.7]},
                sorting_funcs = {'me': self.sort_50, 'ffbm': self.sort_3070},
                drop_na = False, rebalance_freq = 'A'
            )
            sorts_df['SMB3'] = sorts_df[['me1_ffbm1', 'me1_ffbm2', 'me1_ffbm3']].mean(axis = 1) - sorts_df[['me2_ffbm1', 'me2_ffbm2', 'me2_ffbm3']].mean(axis = 1)
            res = res.merge(sorts_df[['date', 'SMB3']], how = 'left', on = ['date'])

        # SMB factor from the 5-factor Fama-French model
        if('SMB5' in factors):
            # sorts on BM
            sortsBM_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'ffbm': [0.3, 0.7]}, 
                sorting_funcs = {'me': self.sort_50, 'ffbm': self.sort_3070}, 
                drop_na = False, rebalance_freq = 'A'
            )

            # sorts on OP
            sortsOP_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'op': [0.3, 0.7]}, 
                sorting_funcs = {'me': self.sort_50, 'op': self.sort_3070}, 
                drop_na = False, rebalance_freq = 'A'
            )

            # sorts on INV
            sortsINV_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'inv': [0.3, 0.7]}, 
                sorting_funcs = {'me': self.sort_50, 'inv': self.sort_3070}, 
                drop_na = False, rebalance_freq = 'A'
            )

            # combine sorts into one dataframe
            sortsBM_df = sortsBM_df.merge(sortsOP_df, how = 'left', on = ['date'])
            sortsBM_df = sortsBM_df.merge(sortsINV_df, how = 'left', on = ['date'])

            # housekeeping
            sortsBM_df = sortsBM_df.set_index('date')
            sortsBM_df = sortsBM_df.dropna(how = 'all')

            # create factors
            sortsBM_df['SMB_BM'] = sortsBM_df[['me1_ffbm1', 'me1_ffbm2', 'me1_ffbm3']].mean(axis = 1) - sortsBM_df[['me2_ffbm1', 'me2_ffbm2', 'me2_ffbm3']].mean(axis = 1)
            sortsBM_df['SMB_OP'] = sortsOP_df[['me1_op1', 'me1_op2', 'me1_op3']].mean(axis = 1) - sortsOP_df[['me2_op1', 'me2_op2', 'me2_op3']].mean(axis = 1)
            sortsBM_df['SMB_INV'] = sortsINV_df[['me1_inv1', 'me1_inv2', 'me1_inv3']].mean(axis = 1) - sortsINV_df[['me2_inv1', 'me2_inv2', 'me2_inv3']].mean(axis = 1)

            # average factors
            sortsBM_df['SMB5'] = sortsBM_df[['SMB_BM', 'SMB_OP', 'SMB_INV']].mean(axis = 1)

            # add to result dataframe
            sortsBM_df = sortsBM_df.reset_index()
            res = res.merge(sortsBM_df[['date', 'SMB5']], how = 'left', on = ['date'])

        if('HML' in factors):
            sortsBM_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'ffbm': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'ffbm': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsBM_df['HML'] = (1/2) * (sortsBM_df.me1_ffbm3 + sortsBM_df.me2_ffbm3) - (1/2) * (sortsBM_df.me1_ffbm1 + sortsBM_df.me2_ffbm1)
            res = res.merge(sortsBM_df[['date', 'HML']], how = 'left', on = ['date'])

        if('RMW' in factors):
            sortsOP_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'op': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'op': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsOP_df['RMW'] = (1/2) * (sortsOP_df.me1_op3 + sortsOP_df.me2_op3) - (1/2) * (sortsOP_df.me1_op1 + sortsOP_df.me2_op1)
            res = res.merge(sortsOP_df[['date', 'RMW']], how = 'left', on = ['date'])

        if('CMA' in factors):
            sortsINV_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'inv': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'inv': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsINV_df['CMA'] = (1/2) * (sortsINV_df.me1_inv1 + sortsINV_df.me2_inv1) - (1/2) * (sortsINV_df.me1_inv3 + sortsINV_df.me2_inv3)
            res = res.merge(sortsINV_df[['date', 'CMA']], how = 'left', on = ['date'])

        if('MOM' in factors):
            sortsPR2_12_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'pr2_12': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'pr2_12': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsPR2_12_df['MOM'] = (1/2) * (sortsPR2_12_df.me1_pr2_123 + sortsPR2_12_df.me2_pr2_123) - (1/2) * (sortsPR2_12_df.me1_pr2_121 + sortsPR2_12_df.me2_pr2_121)
            res = res.merge(sortsPR2_12_df[['date', 'MOM']], how = 'left', on = ['date'])

        if('ST_REV' in factors):
            sortsPR1_1_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'pr1_1': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'pr1_1': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsPR1_1_df['ST_REV'] = (1/2) * (sortsPR1_1_df.me1_pr1_11 + sortsPR1_1_df.me2_pr1_11) - (1/2) * (sortsPR1_1_df.me1_pr1_13 + sortsPR1_1_df.me2_pr1_13)
            res = res.merge(sortsPR1_1_df[['date', 'ST_REV']], how = 'left', on = ['date'])

        if('LT_REV' in factors):
            sortsPR13_60_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'pr13_60': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'pr13_60': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsPR13_60_df['LT_REV'] = (1/2) * (sortsPR13_60_df.me1_pr13_601 + sortsPR13_60_df.me2_pr13_601) - (1/2) * (sortsPR13_60_df.me1_pr13_603 + sortsPR13_60_df.me2_pr13_603)
            res = res.merge(sortsPR13_60_df[['date', 'LT_REV']], how = 'left', on = ['date'])

        res = res.set_index('date').sort_index()
        if(drop_na): res = res.dropna(how = 'all')
        return(res)



    def FF_3factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin = None):
        return(self.FF_factors(factors = ['MKT_RF', 'SMB3', 'HML'], 
                               dfin = dfin,
                               start_date = start_date, end_date = end_date, 
                               weight_type = weigth_type, ret_type = ret_type, 
                               drop_na = drop_na))

    def FF_5factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin = None):
        return(self.FF_factors(factors = ['MKT_RF', 'SMB5', 'HML', 'CMA', 'RMW'], 
                               dfin = dfin,
                               start_date = start_date, end_date = end_date, 
                               weight_type = weigth_type, ret_type = ret_type, 
                               drop_na = drop_na))
    
    def breakpoint_ts(self, df_in, vars, qtiles = None):
        
        DEFAULT_QTILES = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
        DECILES_QTILES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        QUINTIL_QTILES = [0.2, 0.4, 0.6, 0.8]

        dict_in = {}
        if(type(vars) is dict):
            dict_in = vars
        else:
            if(type(qtiles) is int or qtiles is None):
                for var in vars:
                    if(qtiles == 5):
                        dict_in[var] = QUINTIL_QTILES
                    elif(qtiles == 10):
                        dict_in[var] = DECILES_QTILES
                    else:
                        dict_in[var] = DEFAULT_QTILES
            elif(type(qtiles) is list):
                for var in vars:
                    dict_in[var] = qtiles
            else:
                raise TypeError("No valid vars or qtile combination given.")

        res = []
        for var, qtiles in dict_in.items():
            temp = df_in.groupby('date')[var].describe(percentiles = qtiles)
            ptiles = [f'{int(100 * q)}%' for q in qtiles]
            temp = temp[ptiles]
            temp = temp.add_prefix(f'{var}_')
            res.append(temp)

        fin = functools.reduce(lambda x, y: pd.merge(x, y, on = 'date'), res)
        fin = fin.reset_index()
        return(fin)
    

    # sorting functions
    def sort_50(self, row, var):
        if(row[var] < row[f'{var}_50%']):
            res = f'{var}1'
        elif(row[var] >= row[f'{var}_50%']):
            res = f'{var}2'
        else:
            res = '--fail'
        return(res)
    
    def sort_050(self, row, var):
        if(row[var] < 0):
            res = f'{var}1'
        if(row[var] >= 0 and row[var] < row[f'{var}_50%']):
            res = f'{var}2'
        elif(row[var] >= row[f'{var}_50%']):
            res = f'{var}3'
        else:
            res = '--fail'
        return(res)
    
    def sort_3070(self, row, var):
        if(row[var] < row[f'{var}_30%']):
            res = f'{var}1'
        elif(row[var] >= row[f'{var}_30%'] and row[var] < row[f'{var}_70%']):
            res = f'{var}2'
        elif(row[var] >= row[f'{var}_70%']):
            res = f'{var}3'
        else:
            res = '--fail'
        return(res)
    
    def sort_03070(self, row, var):
        if(row[var] <= 0):
            res = f'{var}1'
        elif(row[var] >= 0 and row[var] < row[f'{var}_30%']):
            res = f'{var}2'
        elif(row[var] >= row[f'{var}_30%'] and row[var] < row[f'{var}_70%']):
            res = f'{var}3'
        elif(row[var] >= row[f'{var}_70%']):
            res = f'{var}4'
        else:
            res = '--fail'
        return(res)
    
    def sort_quintile(self, row, var):
        if(row[var] < row[f'{var}_20%']):
            res = f'{var}1'
        elif(row[var] >= row[f'{var}_20%'] and row[var] < row[f'{var}_40%']):
            res = f'{var}2'
        elif(row[var] >= row[f'{var}_40%'] and row[var] < row[f'{var}_60%']):
            res = f'{var}3'
        elif(row[var] >= row[f'{var}_60%'] and row[var] < row[f'{var}_80%']):
            res = f'{var}4'
        elif(row[var] >= row[f'{var}_80%']):
            res = f'{var}5'
        else:
            res = '--fail'
        return(res)
    
    def sort_deciles(self, row, var):
        if(row[var] < row[f'{var}_10%']):
            res = f'{var}1'
        elif(row[var] >= row[f'{var}_10%'] and row[var] < row[f'{var}_20%']):
            res = f'{var}2'
        elif(row[var] >= row[f'{var}_20%'] and row[var] < row[f'{var}_30%']):
            res = f'{var}3'
        elif(row[var] >= row[f'{var}_30%'] and row[var] < row[f'{var}_40%']):
            res = f'{var}4'
        elif(row[var] >= row[f'{var}_40%'] and row[var] < row[f'{var}_50%']):
            res = f'{var}5'
        elif(row[var] >= row[f'{var}_50%'] and row[var] < row[f'{var}_60%']):
            res = f'{var}6'
        elif(row[var] >= row[f'{var}_60%'] and row[var] < row[f'{var}_70%']):
            res = f'{var}7'
        elif(row[var] >= row[f'{var}_70%'] and row[var] < row[f'{var}_80%']):
            res = f'{var}8'
        elif(row[var] >= row[f'{var}_80%'] and row[var] < row[f'{var}_90%']):
            res = f'{var}9'
        elif(row[var] >= row[f'{var}_90%']):
            res = f'{var}10'
        else:
            res = '--fail'
        return(res)

    def sort_portfolios(self, stocks, char_bkpts, sorting_funcs, rebalance_freq, weight_type = 'vw', sort_month = 7, ex_dividend = False, drop_na = True, breakpoint_exchanges = ['1'], **kwargs):

        # removes nans
        stocks = stocks[(stocks.me > 0) & (stocks.wt > 0)]

        stocks.date = pd.to_datetime(stocks.date)

        if(rebalance_freq == 'A'):
            rebalance_df = stocks[stocks.month == sort_month]
        else:
            rebalance_df = stocks

        breakpoint_stocks_df = rebalance_df[rebalance_df.exchcd.isin(breakpoint_exchanges)]

        # calculate breakpoints
        breakpoints_df = self.breakpoint_ts(breakpoint_stocks_df, vars = char_bkpts)

        # merge breakpoints to the rebalance df
        rebalance_df = breakpoints_df.merge(rebalance_df, how = 'inner', on = ['date'])

        ret_typ = 'adjretx' if(ex_dividend) else 'adjret'

        rank_cols = []
        for char, func in sorting_funcs.items():
            rank_cols.append(f'{char}_rank')
            rebalance_df[f'{char}_rank'] = rebalance_df.apply(func, args = (char, ), axis = 1)

        for rank_col in rank_cols:
            if('--fail' in rebalance_df[rank_col].unique()):
                cprint.warn(f'There are stocks that could not be sorted in {rank_col}. They will be removed before constructing portfolios.')
                rebalance_df = rebalance_df[rebalance_df[rank_col] != '--fail']
 
        rebalance_df['port_name'] = rebalance_df[rank_cols].agg('_'.join, axis = 1)

        if(rebalance_freq == 'A'):
            fin = stocks.merge(rebalance_df[['permno', 'ffyear', 'port_name']], how = 'left', on = ['permno', 'ffyear'])
        else:
            fin = rebalance_df
        
        fin = fin.dropna(subset = ['port_name'])
        rets = None
        if(weight_type == 'vw'):
            rets = fin.groupby(['date', 'port_name']).apply(py_functions.wavg, ret_typ, 'wt').to_frame().reset_index().rename(columns = {0: ret_typ})
        else:
            rets = fin.groupby(['date', 'port_name']).mean(numeric_only = True)[ret_typ].to_frame().reset_index().rename(columns = {0: ret_typ})
        firm = fin.groupby(['date', 'port_name'])['permno'].count().reset_index().rename(columns = {'permno': 'num_firms'})

        rets = rets.pivot(index = 'date', columns = 'port_name', values = ret_typ)
        firm = firm.pivot(index = 'date', columns = 'port_name', values = 'num_firms')
        firm = firm.add_suffix('_num_firms')

        res = rets.merge(firm, how = 'inner', on = ['date'])
        res = res.reset_index()
        if(drop_na): res = res.dropna()

        return(res)

Classes

class FamaFrench (wrds_username: str, db_path: pathlib.Path)
Expand source code
class FamaFrench:

    # TODO (1) Error Checking
    #      (2) Industry Sorts

    def __init__(self, wrds_username: str, db_path: pathlib.Path):
        """Attach a WRDS query handle to the instance as `self.DB`.

        Args:
            wrds_username: passed to `QueryWRDS.QueryWRDS` as its first argument
            db_path: passed to `QueryWRDS.QueryWRDS` as `local_db_path`
        """
        wrds_handle = QueryWRDS.QueryWRDS(wrds_username, local_db_path = db_path)
        self.DB = wrds_handle

    # Helper Function
    def _safe_append(self, obj, chk, ins):
        if(not chk in obj):
            obj[chk] = [ins]
        else:
            obj[chk].append(ins)
        return(obj)

    # Helper Function
    def _portfolio_return(self, df, ret_type, weight_type, name):
        if(weight_type == 'vw'):
                mkt_s = df.groupby('date').apply(py_functions.wavg, ret_type, 'me')
        else:
            mkt_s = df.groupby('date').mean(numeric_only = True)[ret_type]
        mkt_s.name = name
        mkt_s = mkt_s.to_frame().reset_index()
        return(mkt_s)

    '''
    Valid Factors: MKT, RF, MKT_RF, SMB3, SMB5, HML, RMW, CMA, MOM, ST_REV, LT_REV 
    '''
    # TODO: Check for none dates
    def FF_factors(self, factors: list[str], 
                   dfin = None, 
                   start_date: datetime.datetime = None, 
                   end_date: datetime.datetime = None, 
                   weight_type: str = 'vw', 
                   ret_type: str = 'adjret', 
                   drop_na: bool = True
        ) -> pd.DataFrame:
        """Creates standard Fama-French factors
        
        Creates the Fama-French factors using the original accounting practices from
        Eugene Fama's and Kenneth French's original 1992 paper. 
        The Cross-Section of Expected Stock Returns https://doi.org/10.1111/j.1540-6261.1992.tb04398.x
        
        Constructable factors include: 'MKT' market return, 'RF' risk free rate, 'MKT_RF' equity premium, 
        'SMB3' 3 factor small minus big, 'SMB5' 5 factor small minus big, 'HML' high minus low, 
        'RMW' robust minus weak, 'CMA' conservative minus aggresive, 'MOM' momentum, 
        'ST_REV' short term reversal, 'LT_Rev' long term reversal. See https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
        for constrution notes.
        
        Args:
            factors: list of factors
            dfin: datadrame with assets to use (optional)
            start_date: start date for factors (optional)
            end_date: end date for factors (optional)
            weight_type: weights used to calculate returns
            ret_type: return type with or without dividends
            drop_na: if true drop rows that have NaN values

        Returns: 
            A dataframe with the specifed factros and a date column. Dataframe sorted by date.

        Example:
            Create the original 3 factor Fama-French model between 'date1' and 'date2'

            df = FamaFrench.FF_factors(
                factors = ['MKT_RF', 'SMB3', 'HML'], 
                start_date = date1, 
                end_date = date2
            )

        TODO:
            Error checking
        """
        
        # query DB if no dataframe supplied
        if(dfin is None):
            ccm_df = self.DB.query_CCM(start_date, end_date)
        else:
            ccm_df = dfin 
            
        # create resulting dataframe
        res = pd.DataFrame()
        date_s = ccm_df.date.unique()
        res['date'] = date_s
        res = res.sort_values(by = ['date'])

        # extract state and end date from dataframe
        if(not dfin is None):
            start_date = np.min(res.date)
            end_date = np.max(res.date)

        # calculate the market return of supplied assets
        if('MKT' in factors):
            mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
            res = res.merge(mkt_df, how = 'left', on = ['date'])

        # add the risk free rate
        if('RF' in factors):
            rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
            rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
            res = res.merge(rf_df, on = ['date'], how = 'left')

        # add the market premium
        if('MKT_RF' in factors):
            if('MKT' in factors and 'RF' in factors):
                res['MKT_RF'] = res.MKT - res.RF
            elif('MKT' in factors and not 'RF' in factors):
                rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
                rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
                res = res.merge(rf_df, on = ['date'], how = 'left')
                res['MKT_RF'] = res.MKT - res.RF
                res = res.drop(columns = ['MKT', 'RF'])
            elif(not 'MKT' in factors and 'RF' in factors):
                mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
                res = res.merge(mkt_df, how = 'left', on = ['date'])
                res['MKT_RF'] = res.MKT - res.RF
                res = res.drop(columns = ['MKT', 'RF'])
            else:
                mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
                res = res.merge(mkt_df, how = 'left', on = ['date'])
                rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
                rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
                res = res.merge(rf_df, on = ['date'], how = 'left')
                res['MKT_RF'] = res.MKT - res.RF
                res = res.drop(columns = ['MKT', 'RF'])

        # SMB factor from the 3-factor Fama-French model
        if('SMB3' in factors):
            # portfolio sorts on ME and BM
            sorts_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'ffbm': [0.3, 0.7]},
                sorting_funcs = {'me': self.sort_50, 'ffbm': self.sort_3070},
                drop_na = False, rebalance_freq = 'A'
            )
            sorts_df['SMB3'] = sorts_df[['me1_ffbm1', 'me1_ffbm2', 'me1_ffbm3']].mean(axis = 1) - sorts_df[['me2_ffbm1', 'me2_ffbm2', 'me2_ffbm3']].mean(axis = 1)
            res = res.merge(sorts_df[['date', 'SMB3']], how = 'left', on = ['date'])

        # SMB factor from the 5-factor Fama-French model
        if('SMB5' in factors):
            # sorts on BM
            sortsBM_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'ffbm': [0.3, 0.7]}, 
                sorting_funcs = {'me': self.sort_50, 'ffbm': self.sort_3070}, 
                drop_na = False, rebalance_freq = 'A'
            )

            # sorts on OP
            sortsOP_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'op': [0.3, 0.7]}, 
                sorting_funcs = {'me': self.sort_50, 'op': self.sort_3070}, 
                drop_na = False, rebalance_freq = 'A'
            )

            # sorts on INV
            sortsINV_df = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], 'inv': [0.3, 0.7]}, 
                sorting_funcs = {'me': self.sort_50, 'inv': self.sort_3070}, 
                drop_na = False, rebalance_freq = 'A'
            )

            # combine sorts into one dataframe
            sortsBM_df = sortsBM_df.merge(sortsOP_df, how = 'left', on = ['date'])
            sortsBM_df = sortsBM_df.merge(sortsINV_df, how = 'left', on = ['date'])

            # housekeeping
            sortsBM_df = sortsBM_df.set_index('date')
            sortsBM_df = sortsBM_df.dropna(how = 'all')

            # create factors
            sortsBM_df['SMB_BM'] = sortsBM_df[['me1_ffbm1', 'me1_ffbm2', 'me1_ffbm3']].mean(axis = 1) - sortsBM_df[['me2_ffbm1', 'me2_ffbm2', 'me2_ffbm3']].mean(axis = 1)
            sortsBM_df['SMB_OP'] = sortsOP_df[['me1_op1', 'me1_op2', 'me1_op3']].mean(axis = 1) - sortsOP_df[['me2_op1', 'me2_op2', 'me2_op3']].mean(axis = 1)
            sortsBM_df['SMB_INV'] = sortsINV_df[['me1_inv1', 'me1_inv2', 'me1_inv3']].mean(axis = 1) - sortsINV_df[['me2_inv1', 'me2_inv2', 'me2_inv3']].mean(axis = 1)

            # average factors
            sortsBM_df['SMB5'] = sortsBM_df[['SMB_BM', 'SMB_OP', 'SMB_INV']].mean(axis = 1)

            # add to result dataframe
            sortsBM_df = sortsBM_df.reset_index()
            res = res.merge(sortsBM_df[['date', 'SMB5']], how = 'left', on = ['date'])

        if('HML' in factors):
            sortsBM_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'ffbm': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'ffbm': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsBM_df['HML'] = (1/2) * (sortsBM_df.me1_ffbm3 + sortsBM_df.me2_ffbm3) - (1/2) * (sortsBM_df.me1_ffbm1 + sortsBM_df.me2_ffbm1)
            res = res.merge(sortsBM_df[['date', 'HML']], how = 'left', on = ['date'])

        if('RMW' in factors):
            sortsOP_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'op': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'op': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsOP_df['RMW'] = (1/2) * (sortsOP_df.me1_op3 + sortsOP_df.me2_op3) - (1/2) * (sortsOP_df.me1_op1 + sortsOP_df.me2_op1)
            res = res.merge(sortsOP_df[['date', 'RMW']], how = 'left', on = ['date'])

        if('CMA' in factors):
            sortsINV_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'inv': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'inv': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsINV_df['CMA'] = (1/2) * (sortsINV_df.me1_inv1 + sortsINV_df.me2_inv1) - (1/2) * (sortsINV_df.me1_inv3 + sortsINV_df.me2_inv3)
            res = res.merge(sortsINV_df[['date', 'CMA']], how = 'left', on = ['date'])

        if('MOM' in factors):
            sortsPR2_12_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'pr2_12': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'pr2_12': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsPR2_12_df['MOM'] = (1/2) * (sortsPR2_12_df.me1_pr2_123 + sortsPR2_12_df.me2_pr2_123) - (1/2) * (sortsPR2_12_df.me1_pr2_121 + sortsPR2_12_df.me2_pr2_121)
            res = res.merge(sortsPR2_12_df[['date', 'MOM']], how = 'left', on = ['date'])

        if('ST_REV' in factors):
            sortsPR1_1_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'pr1_1': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'pr1_1': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsPR1_1_df['ST_REV'] = (1/2) * (sortsPR1_1_df.me1_pr1_11 + sortsPR1_1_df.me2_pr1_11) - (1/2) * (sortsPR1_1_df.me1_pr1_13 + sortsPR1_1_df.me2_pr1_13)
            res = res.merge(sortsPR1_1_df[['date', 'ST_REV']], how = 'left', on = ['date'])

        if('LT_REV' in factors):
            sortsPR13_60_df = self.sort_portfolios(stocks = ccm_df, char_bkpts = {'me': [0.5], 'pr13_60': [0.3, 0.7]}, sorting_funcs = {'me': self.sort_50, 'pr13_60': self.sort_3070}, drop_na = False, rebalance_freq = 'A')
            sortsPR13_60_df['LT_REV'] = (1/2) * (sortsPR13_60_df.me1_pr13_601 + sortsPR13_60_df.me2_pr13_601) - (1/2) * (sortsPR13_60_df.me1_pr13_603 + sortsPR13_60_df.me2_pr13_603)
            res = res.merge(sortsPR13_60_df[['date', 'LT_REV']], how = 'left', on = ['date'])

        res = res.set_index('date').sort_index()
        if(drop_na): res = res.dropna(how = 'all')
        return(res)



    def FF_3factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin = None):
        return(self.FF_factors(factors = ['MKT_RF', 'SMB3', 'HML'], 
                               dfin = dfin,
                               start_date = start_date, end_date = end_date, 
                               weight_type = weigth_type, ret_type = ret_type, 
                               drop_na = drop_na))

    def FF_5factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin = None):
        """Build the five-factor set ('MKT_RF', 'SMB5', 'HML', 'CMA', 'RMW') through FF_factors.

        Every argument is forwarded unchanged to FF_factors. Note that this
        wrapper's keyword is spelled 'weigth_type'; it is passed on as
        FF_factors' 'weight_type'.

        Returns:
            Whatever FF_factors returns for the five requested factors.
        """
        forwarded = dict(
            dfin = dfin,
            start_date = start_date,
            end_date = end_date,
            weight_type = weigth_type,
            ret_type = ret_type,
            drop_na = drop_na,
        )
        five_factors = ['MKT_RF', 'SMB5', 'HML', 'CMA', 'RMW']
        return(self.FF_factors(factors = five_factors, **forwarded))
    
    def breakpoint_ts(self, df_in, vars, qtiles = None):
        """Compute per-date quantile breakpoints for one or more columns.

        Args:
            df_in: dataframe with a 'date' column and every column named in vars
            vars: either a dict mapping column name -> list of quantiles in [0, 1],
                or an iterable of column names (quantiles then come from qtiles)
            qtiles: only used when vars is not a dict. 5 selects quintile
                breakpoints, 10 selects decile breakpoints, a list/tuple is
                applied to every column, and None or any other int selects
                5% steps up to 100%

        Returns:
            A dataframe with a 'date' column and one '<var>_<pct>%' column per
            requested quantile, e.g. 'me_50%' or 'op_12.5%'.

        Raises:
            TypeError: if vars is not a dict and qtiles is not an int, list,
                tuple or None
        """
        DEFAULT_QTILES = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
        DECILES_QTILES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        QUINTIL_QTILES = [0.2, 0.4, 0.6, 0.8]

        if(isinstance(vars, dict)):
            dict_in = vars
        elif(qtiles is None or type(qtiles) is int):
            if(qtiles == 5):
                shared_qtiles = QUINTIL_QTILES
            elif(qtiles == 10):
                shared_qtiles = DECILES_QTILES
            else:
                shared_qtiles = DEFAULT_QTILES
            dict_in = {var: shared_qtiles for var in vars}
        elif(isinstance(qtiles, (list, tuple))):
            dict_in = {var: qtiles for var in vars}
        else:
            raise TypeError("No valid vars or qtile combination given.")

        res = []
        for var, var_qtiles in dict_in.items():
            grouped = df_in.groupby('date')[var]
            # Quantiles are computed and labelled directly instead of picking
            # columns out of describe(): int(100 * q) truncates values such as
            # 100 * 0.29 == 28.999... to '28%', which does not exist in the
            # describe() output. '%g' formatting rounds the float noise away
            # and keeps fractional percentiles ('12.5%').
            temp = pd.DataFrame({f'{var}_{100 * q:g}%': grouped.quantile(q) for q in var_qtiles})
            temp = temp.rename_axis('date')
            res.append(temp)

        # every frame is indexed by date, so merging on the index level name lines them up
        fin = functools.reduce(lambda x, y: pd.merge(x, y, on = 'date'), res)
        fin = fin.reset_index()
        return(fin)
    

    # sorting functions
    def sort_50(self, row, var):
        """Assign a row to one of two buckets split at the '<var>_50%' breakpoint.

        Returns '<var>1' below the breakpoint, '<var>2' at or above it, and
        '--fail' when neither comparison holds (e.g. a NaN value or breakpoint).
        """
        value = row[var]
        if(value < row[f'{var}_50%']):
            return(f'{var}1')
        if(value >= row[f'{var}_50%']):
            return(f'{var}2')
        return('--fail')
    
    def sort_050(self, row, var):
        """Assign a row to one of three buckets: negative, below median, at/above median.

        Returns '<var>1' for negative values, '<var>2' for values in
        [0, '<var>_50%'), '<var>3' for values at or above '<var>_50%', and
        '--fail' when no comparison holds (e.g. a NaN value or breakpoint).
        """
        if(row[var] < 0):
            res = f'{var}1'
        # must be elif: a separate 'if' chain here overwrote the negative
        # bucket with '--fail' (or bucket 3 when the median is negative)
        elif(row[var] >= 0 and row[var] < row[f'{var}_50%']):
            res = f'{var}2'
        elif(row[var] >= row[f'{var}_50%']):
            res = f'{var}3'
        else:
            res = '--fail'
        return(res)
    
    def sort_3070(self, row, var):
        """Assign a row to one of three buckets split at '<var>_30%' and '<var>_70%'.

        Returns '<var>1' below the 30% breakpoint, '<var>2' in [30%, 70%),
        '<var>3' at or above the 70% breakpoint, and '--fail' when no
        comparison holds (e.g. a NaN value or breakpoint).
        """
        value = row[var]
        if(value < row[f'{var}_30%']):
            return(f'{var}1')
        if(row[f'{var}_30%'] <= value < row[f'{var}_70%']):
            return(f'{var}2')
        if(value >= row[f'{var}_70%']):
            return(f'{var}3')
        return('--fail')
    
    def sort_03070(self, row, var):
        """Assign a row to one of four buckets: non-positive, then 30%/70% splits.

        Returns '<var>1' for values <= 0 (zero lands here, not in bucket 2),
        '<var>2' for values in [0, 30%), '<var>3' for values in [30%, 70%),
        '<var>4' at or above the 70% breakpoint, and '--fail' when no
        comparison holds (e.g. a NaN value or breakpoint).
        """
        value = row[var]
        if(value <= 0):
            return(f'{var}1')
        if(0 <= value < row[f'{var}_30%']):
            return(f'{var}2')
        if(row[f'{var}_30%'] <= value < row[f'{var}_70%']):
            return(f'{var}3')
        if(value >= row[f'{var}_70%']):
            return(f'{var}4')
        return('--fail')
    
    def sort_quintile(self, row, var):
        """Assign a row to one of five buckets using the 20/40/60/80% breakpoints.

        Bucket k covers [previous breakpoint, next breakpoint); the first
        bucket is everything below '<var>_20%' and the fifth everything at or
        above '<var>_80%'. Returns '--fail' when no comparison holds
        (e.g. a NaN value or breakpoint).
        """
        value = row[var]
        pcts = (20, 40, 60, 80)

        def cut(pct):
            # looked up lazily, in the same order the comparisons are made
            return(row[f'{var}_{pct}%'])

        if(value < cut(pcts[0])):
            return(f'{var}1')
        for bucket, (lo, hi) in enumerate(zip(pcts, pcts[1:]), start = 2):
            if(cut(lo) <= value < cut(hi)):
                return(f'{var}{bucket}')
        if(value >= cut(pcts[-1])):
            return(f'{var}5')
        return('--fail')
    
    def sort_deciles(self, row, var):
        """Assign a row to one of ten buckets using the 10%..90% breakpoints.

        Bucket k covers [previous breakpoint, next breakpoint); the first
        bucket is everything below '<var>_10%' and the tenth everything at or
        above '<var>_90%'. Returns '--fail' when no comparison holds
        (e.g. a NaN value or breakpoint).
        """
        value = row[var]
        pcts = (10, 20, 30, 40, 50, 60, 70, 80, 90)

        def cut(pct):
            # looked up lazily, in the same order the comparisons are made
            return(row[f'{var}_{pct}%'])

        if(value < cut(pcts[0])):
            return(f'{var}1')
        for bucket, (lo, hi) in enumerate(zip(pcts, pcts[1:]), start = 2):
            if(cut(lo) <= value < cut(hi)):
                return(f'{var}{bucket}')
        if(value >= cut(pcts[-1])):
            return(f'{var}10')
        return('--fail')

    def sort_portfolios(self, stocks, char_bkpts, sorting_funcs, rebalance_freq, weight_type = 'vw', sort_month = 7, ex_dividend = False, drop_na = True, breakpoint_exchanges = ['1'], **kwargs):
        """Sort stocks into characteristic portfolios and compute their returns.

        Args:
            stocks: dataframe of stock observations. Columns read here: me, wt,
                date, exchcd, permno, the return column ('adjret' or 'adjretx'),
                the sort characteristics, plus month and ffyear when
                rebalance_freq is 'A'
            char_bkpts: dict of characteristic -> list of quantiles, passed to
                breakpoint_ts
            sorting_funcs: dict of characteristic -> function(row, char) that
                returns a bucket label, or '--fail' if the row cannot be sorted
            rebalance_freq: 'A' sorts once a year on rows whose month equals
                sort_month and carries the assignment to every row sharing
                (permno, ffyear); any other value sorts on every date
            weight_type: 'vw' aggregates with py_functions.wavg using 'wt'
                (presumably a weighted average -- helper not visible here);
                any other value takes the plain mean
            sort_month: month used for the annual sort
            ex_dividend: if true use 'adjretx', otherwise 'adjret'
            drop_na: if true drop rows of the result containing any NaN
            breakpoint_exchanges: exchcd values whose stocks are used to
                compute the breakpoints
            **kwargs: accepted but not used

        Returns:
            A dataframe with a 'date' column, one return column per portfolio
            name (rank labels joined by '_') and one '<portfolio>_num_firms'
            column per portfolio.
        """
        annual = (rebalance_freq == 'A')
        ret_typ = 'adjretx' if(ex_dividend) else 'adjret'
        group_keys = ['date', 'port_name']

        # strictly positive me and wt only (comparisons are False for NaN, so NaNs go too)
        stocks = stocks[(stocks.me > 0) & (stocks.wt > 0)]
        stocks.date = pd.to_datetime(stocks.date)

        # rows on which the sort is performed
        rebalance_df = stocks[stocks.month == sort_month] if(annual) else stocks

        # breakpoints come only from the stocks on the breakpoint exchanges
        on_bkpt_exchange = rebalance_df.exchcd.isin(breakpoint_exchanges)
        breakpoints_df = self.breakpoint_ts(rebalance_df[on_bkpt_exchange], vars = char_bkpts)
        rebalance_df = breakpoints_df.merge(rebalance_df, how = 'inner', on = ['date'])

        # one rank column per characteristic, filled row by row
        rank_cols = [f'{char}_rank' for char in sorting_funcs]
        for rank_col, (char, func) in zip(rank_cols, sorting_funcs.items()):
            rebalance_df[rank_col] = rebalance_df.apply(func, args = (char, ), axis = 1)

        # drop rows any sorting function could not place
        for rank_col in rank_cols:
            if('--fail' in rebalance_df[rank_col].unique()):
                cprint.warn(f'There are stocks that could not be sorted in {rank_col}. They will be removed before constructing portfolios.')
                rebalance_df = rebalance_df[rebalance_df[rank_col] != '--fail']

        rebalance_df['port_name'] = rebalance_df[rank_cols].agg('_'.join, axis = 1)

        if(annual):
            # carry the sort-month assignment to every row of the same permno / ffyear
            assignments = rebalance_df[['permno', 'ffyear', 'port_name']]
            fin = stocks.merge(assignments, how = 'left', on = ['permno', 'ffyear'])
        else:
            fin = rebalance_df
        fin = fin.dropna(subset = ['port_name'])

        if(weight_type == 'vw'):
            port_rets = fin.groupby(group_keys).apply(py_functions.wavg, ret_typ, 'wt')
        else:
            port_rets = fin.groupby(group_keys).mean(numeric_only = True)[ret_typ]
        # the apply() result is unnamed, so its frame column is 0 until renamed
        rets = port_rets.to_frame().reset_index().rename(columns = {0: ret_typ})

        firm = fin.groupby(group_keys)['permno'].count().reset_index()
        firm = firm.rename(columns = {'permno': 'num_firms'})

        # long -> wide: one column per portfolio
        rets = rets.pivot(index = 'date', columns = 'port_name', values = ret_typ)
        firm = firm.pivot(index = 'date', columns = 'port_name', values = 'num_firms')
        firm = firm.add_suffix('_num_firms')

        res = rets.merge(firm, how = 'inner', on = ['date']).reset_index()
        if(drop_na):
            res = res.dropna()

        return(res)

Methods

def FF_3factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin=None)
Expand source code
def FF_3factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin = None):
    """Build the three-factor set ('MKT_RF', 'SMB3', 'HML') through FF_factors.

    Every argument is forwarded unchanged to FF_factors. Note that this
    wrapper's keyword is spelled 'weigth_type'; it is passed on as
    FF_factors' 'weight_type'.

    Returns:
        Whatever FF_factors returns for the three requested factors.
    """
    forwarded = dict(
        dfin = dfin,
        start_date = start_date,
        end_date = end_date,
        weight_type = weigth_type,
        ret_type = ret_type,
        drop_na = drop_na,
    )
    three_factors = ['MKT_RF', 'SMB3', 'HML']
    return(self.FF_factors(factors = three_factors, **forwarded))
def FF_5factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin=None)
Expand source code
def FF_5factor(self, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weigth_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True, dfin = None):
    """Build the five-factor set ('MKT_RF', 'SMB5', 'HML', 'CMA', 'RMW') through FF_factors.

    Every argument is forwarded unchanged to FF_factors. Note that this
    wrapper's keyword is spelled 'weigth_type'; it is passed on as
    FF_factors' 'weight_type'.

    Returns:
        Whatever FF_factors returns for the five requested factors.
    """
    forwarded = dict(
        dfin = dfin,
        start_date = start_date,
        end_date = end_date,
        weight_type = weigth_type,
        ret_type = ret_type,
        drop_na = drop_na,
    )
    five_factors = ['MKT_RF', 'SMB5', 'HML', 'CMA', 'RMW']
    return(self.FF_factors(factors = five_factors, **forwarded))
def FF_factors(self, factors: list[str], dfin=None, start_date: datetime.datetime = None, end_date: datetime.datetime = None, weight_type: str = 'vw', ret_type: str = 'adjret', drop_na: bool = True) ‑> pandas.core.frame.DataFrame

Creates standard Fama-French factors

Creates the Fama-French factors using the original accounting practices from Eugene Fama's and Kenneth French's original 1992 paper. The Cross-Section of Expected Stock Returns https://doi.org/10.1111/j.1540-6261.1992.tb04398.x

Constructible factors include: 'MKT' market return, 'RF' risk-free rate, 'MKT_RF' equity premium, 'SMB3' 3-factor small minus big, 'SMB5' 5-factor small minus big, 'HML' high minus low, 'RMW' robust minus weak, 'CMA' conservative minus aggressive, 'MOM' momentum, 'ST_REV' short-term reversal, 'LT_REV' long-term reversal. See https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html for construction notes.

Args

factors
list of factors
dfin
dataframe with assets to use (optional)
start_date
start date for factors (optional)
end_date
end date for factors (optional)
weight_type
weights used to calculate returns
ret_type
return type with or without dividends
drop_na
if true, drop rows in which every requested factor is NaN

Returns: A dataframe with the specified factors, indexed by date and sorted by date.

Example

Create the original 3 factor Fama-French model between 'date1' and 'date2'

df = FamaFrench.FF_factors( factors = ['MKT_RF', 'SMB3', 'HML'], start_date = date1, end_date = date2 )

Todo

Error checking

Expand source code
def FF_factors(self, factors: list[str], 
               dfin = None, 
               start_date: datetime.datetime = None, 
               end_date: datetime.datetime = None, 
               weight_type: str = 'vw', 
               ret_type: str = 'adjret', 
               drop_na: bool = True
    ) -> pd.DataFrame:
    """Creates standard Fama-French factors
    
    Creates the Fama-French factors using the original accounting practices from
    Eugene Fama's and Kenneth French's original 1992 paper. 
    The Cross-Section of Expected Stock Returns https://doi.org/10.1111/j.1540-6261.1992.tb04398.x
    
    Constructible factors include: 'MKT' market return, 'RF' risk-free rate, 'MKT_RF' equity premium, 
    'SMB3' 3-factor small minus big, 'SMB5' 5-factor small minus big, 'HML' high minus low, 
    'RMW' robust minus weak, 'CMA' conservative minus aggressive, 'MOM' momentum, 
    'ST_REV' short-term reversal, 'LT_REV' long-term reversal. See https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
    for construction notes.
    
    Args:
        factors: list of factors
        dfin: dataframe with assets to use (optional); when given, start_date
            and end_date are replaced by the min / max date found in it
        start_date: start date for factors (optional)
        end_date: end date for factors (optional)
        weight_type: weights used to calculate the market return
        ret_type: return column used for the market return
        drop_na: if true, drop rows in which every requested factor is NaN

    Returns: 
        A dataframe with the specified factors, indexed by date and sorted by date.

    Example:
        Create the original 3 factor Fama-French model between 'date1' and 'date2'

        df = FamaFrench.FF_factors(
            factors = ['MKT_RF', 'SMB3', 'HML'], 
            start_date = date1, 
            end_date = date2
        )

    TODO:
        Error checking
    """

    # query DB if no dataframe supplied
    if(dfin is None):
        ccm_df = self.DB.query_CCM(start_date, end_date)
    else:
        ccm_df = dfin

    # create resulting dataframe: one row per date, factors are left-merged onto it
    res = pd.DataFrame()
    res['date'] = ccm_df.date.unique()
    res = res.sort_values(by = ['date'])

    # extract start and end date from dataframe
    if(dfin is not None):
        start_date = np.min(res.date)
        end_date = np.max(res.date)

    # The same size x characteristic sort feeds several factors (e.g. ME x BM
    # feeds SMB3, SMB5 and HML), so each sort is computed once per call.
    # Cached frames are never modified below.
    sort_cache = {}

    def _size_sort(char):
        # 2x3 sort: median split on 'me', 30/70 split on char, annual rebalance
        if(char not in sort_cache):
            sort_cache[char] = self.sort_portfolios(
                stocks = ccm_df, char_bkpts = {'me': [0.5], char: [0.3, 0.7]},
                sorting_funcs = {'me': self.sort_50, char: self.sort_3070},
                drop_na = False, rebalance_freq = 'A'
            )
        return(sort_cache[char])

    def _small_minus_big(sorts, char):
        # average of the three small portfolios minus average of the three big ones
        small = sorts[[f'me1_{char}1', f'me1_{char}2', f'me1_{char}3']].mean(axis = 1)
        big = sorts[[f'me2_{char}1', f'me2_{char}2', f'me2_{char}3']].mean(axis = 1)
        return(small - big)

    def _leg_spread(sorts, char, long_leg, short_leg):
        # size-averaged long bucket minus size-averaged short bucket
        long_side = (1/2) * (sorts[f'me1_{char}{long_leg}'] + sorts[f'me2_{char}{long_leg}'])
        short_side = (1/2) * (sorts[f'me1_{char}{short_leg}'] + sorts[f'me2_{char}{short_leg}'])
        return(long_side - short_side)

    def _attach(frame, sorts, name, values):
        # left-merge a factor series (row-aligned with sorts) onto the result by date
        factor_df = sorts[['date']].assign(**{name: values})
        return(frame.merge(factor_df, how = 'left', on = ['date']))

    want_mkt = 'MKT' in factors
    want_rf = 'RF' in factors
    want_mkt_rf = 'MKT_RF' in factors

    # calculate the market return of supplied assets (also needed for MKT_RF)
    if(want_mkt or want_mkt_rf):
        mkt_df = self._portfolio_return(ccm_df, ret_type, weight_type, 'MKT')
        res = res.merge(mkt_df, how = 'left', on = ['date'])

    # add the risk free rate (also needed for MKT_RF)
    if(want_rf or want_mkt_rf):
        rf_df = self.DB.query_riskfree(start_date, end_date, 'M')
        rf_df = rf_df.rename(columns = {'rf': 'RF'}) # just for naming consistency
        res = res.merge(rf_df, on = ['date'], how = 'left')

    # add the market premium
    if(want_mkt_rf):
        res['MKT_RF'] = res.MKT - res.RF
        # drop only the inputs that were pulled in as helpers; MKT / RF stay
        # in the result when the caller asked for them
        helper_cols = [col for col, wanted in (('MKT', want_mkt), ('RF', want_rf)) if not wanted]
        if(helper_cols):
            res = res.drop(columns = helper_cols)

    # SMB factor from the 3-factor Fama-French model
    if('SMB3' in factors):
        sorts_df = _size_sort('ffbm')
        res = _attach(res, sorts_df, 'SMB3', _small_minus_big(sorts_df, 'ffbm'))

    # SMB factor from the 5-factor Fama-French model
    if('SMB5' in factors):
        # combine the BM, OP and INV sorts into one date-indexed dataframe
        combined_df = _size_sort('ffbm').merge(_size_sort('op'), how = 'left', on = ['date'])
        combined_df = combined_df.merge(_size_sort('inv'), how = 'left', on = ['date'])
        combined_df = combined_df.set_index('date')
        combined_df = combined_df.dropna(how = 'all')

        # All three components are computed from the combined, date-indexed
        # frame. Computing SMB_OP / SMB_INV from the separate integer-indexed
        # sort frames and assigning them here aligns on mismatched indexes and
        # yields all-NaN columns, which silently reduced SMB5 to SMB_BM.
        components = pd.concat(
            [
                _small_minus_big(combined_df, 'ffbm'),
                _small_minus_big(combined_df, 'op'),
                _small_minus_big(combined_df, 'inv'),
            ],
            axis = 1
        )

        # average the components (NaN components are skipped by mean)
        smb5_df = components.mean(axis = 1).rename('SMB5').reset_index()
        res = res.merge(smb5_df[['date', 'SMB5']], how = 'left', on = ['date'])

    # (factor, sort characteristic, long bucket, short bucket)
    # NOTE(review): the prior-return factors (MOM, ST_REV, LT_REV) use the same
    # annual rebalance as the accounting factors; the Ken French data library
    # appears to form those portfolios monthly -- confirm this is intended.
    spread_specs = [
        ('HML', 'ffbm', 3, 1),
        ('RMW', 'op', 3, 1),
        ('CMA', 'inv', 1, 3),
        ('MOM', 'pr2_12', 3, 1),
        ('ST_REV', 'pr1_1', 1, 3),
        ('LT_REV', 'pr13_60', 1, 3),
    ]
    for name, char, long_leg, short_leg in spread_specs:
        if(name in factors):
            sorts_df = _size_sort(char)
            res = _attach(res, sorts_df, name, _leg_spread(sorts_df, char, long_leg, short_leg))

    res = res.set_index('date').sort_index()
    if(drop_na): res = res.dropna(how = 'all')
    return(res)
def breakpoint_ts(self, df_in, vars, qtiles=None)
Expand source code
def breakpoint_ts(self, df_in, vars, qtiles = None):
    """Compute per-date quantile breakpoints for one or more columns.

    Args:
        df_in: dataframe with a 'date' column and every column named in vars
        vars: either a dict mapping column name -> list of quantiles in [0, 1],
            or an iterable of column names (quantiles then come from qtiles)
        qtiles: only used when vars is not a dict. 5 selects quintile
            breakpoints, 10 selects decile breakpoints, a list/tuple is
            applied to every column, and None or any other int selects
            5% steps up to 100%

    Returns:
        A dataframe with a 'date' column and one '<var>_<pct>%' column per
        requested quantile, e.g. 'me_50%' or 'op_12.5%'.

    Raises:
        TypeError: if vars is not a dict and qtiles is not an int, list,
            tuple or None
    """
    DEFAULT_QTILES = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    DECILES_QTILES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    QUINTIL_QTILES = [0.2, 0.4, 0.6, 0.8]

    if(isinstance(vars, dict)):
        dict_in = vars
    elif(qtiles is None or type(qtiles) is int):
        if(qtiles == 5):
            shared_qtiles = QUINTIL_QTILES
        elif(qtiles == 10):
            shared_qtiles = DECILES_QTILES
        else:
            shared_qtiles = DEFAULT_QTILES
        dict_in = {var: shared_qtiles for var in vars}
    elif(isinstance(qtiles, (list, tuple))):
        dict_in = {var: qtiles for var in vars}
    else:
        raise TypeError("No valid vars or qtile combination given.")

    res = []
    for var, var_qtiles in dict_in.items():
        grouped = df_in.groupby('date')[var]
        # Quantiles are computed and labelled directly instead of picking
        # columns out of describe(): int(100 * q) truncates values such as
        # 100 * 0.29 == 28.999... to '28%', which does not exist in the
        # describe() output. '%g' formatting rounds the float noise away
        # and keeps fractional percentiles ('12.5%').
        temp = pd.DataFrame({f'{var}_{100 * q:g}%': grouped.quantile(q) for q in var_qtiles})
        temp = temp.rename_axis('date')
        res.append(temp)

    # every frame is indexed by date, so merging on the index level name lines them up
    fin = functools.reduce(lambda x, y: pd.merge(x, y, on = 'date'), res)
    fin = fin.reset_index()
    return(fin)
def sort_03070(self, row, var)
Expand source code
def sort_03070(self, row, var):
    """Assign a row to one of four buckets: non-positive, then 30%/70% splits.

    Returns '<var>1' for values <= 0 (zero lands here, not in bucket 2),
    '<var>2' for values in [0, 30%), '<var>3' for values in [30%, 70%),
    '<var>4' at or above the 70% breakpoint, and '--fail' when no
    comparison holds (e.g. a NaN value or breakpoint).
    """
    value = row[var]
    if(value <= 0):
        return(f'{var}1')
    if(0 <= value < row[f'{var}_30%']):
        return(f'{var}2')
    if(row[f'{var}_30%'] <= value < row[f'{var}_70%']):
        return(f'{var}3')
    if(value >= row[f'{var}_70%']):
        return(f'{var}4')
    return('--fail')
def sort_050(self, row, var)
Expand source code
def sort_050(self, row, var):
    """Assign a row to one of three buckets: negative, below median, at/above median.

    Returns '<var>1' for negative values, '<var>2' for values in
    [0, '<var>_50%'), '<var>3' for values at or above '<var>_50%', and
    '--fail' when no comparison holds (e.g. a NaN value or breakpoint).
    """
    if(row[var] < 0):
        res = f'{var}1'
    # must be elif: a separate 'if' chain here overwrote the negative
    # bucket with '--fail' (or bucket 3 when the median is negative)
    elif(row[var] >= 0 and row[var] < row[f'{var}_50%']):
        res = f'{var}2'
    elif(row[var] >= row[f'{var}_50%']):
        res = f'{var}3'
    else:
        res = '--fail'
    return(res)
def sort_3070(self, row, var)
Expand source code
def sort_3070(self, row, var):
    """Assign a row to one of three buckets split at '<var>_30%' and '<var>_70%'.

    Returns '<var>1' below the 30% breakpoint, '<var>2' in [30%, 70%),
    '<var>3' at or above the 70% breakpoint, and '--fail' when no
    comparison holds (e.g. a NaN value or breakpoint).
    """
    value = row[var]
    if(value < row[f'{var}_30%']):
        return(f'{var}1')
    if(row[f'{var}_30%'] <= value < row[f'{var}_70%']):
        return(f'{var}2')
    if(value >= row[f'{var}_70%']):
        return(f'{var}3')
    return('--fail')
def sort_50(self, row, var)
Expand source code
def sort_50(self, row, var):
    """Assign a row to one of two buckets split at the '<var>_50%' breakpoint.

    Returns '<var>1' below the breakpoint, '<var>2' at or above it, and
    '--fail' when neither comparison holds (e.g. a NaN value or breakpoint).
    """
    value = row[var]
    if(value < row[f'{var}_50%']):
        return(f'{var}1')
    if(value >= row[f'{var}_50%']):
        return(f'{var}2')
    return('--fail')
def sort_deciles(self, row, var)
Expand source code
def sort_deciles(self, row, var):
    """Assign a row to one of ten buckets using the 10%..90% breakpoints.

    Bucket k covers [previous breakpoint, next breakpoint); the first
    bucket is everything below '<var>_10%' and the tenth everything at or
    above '<var>_90%'. Returns '--fail' when no comparison holds
    (e.g. a NaN value or breakpoint).
    """
    value = row[var]
    pcts = (10, 20, 30, 40, 50, 60, 70, 80, 90)

    def cut(pct):
        # looked up lazily, in the same order the comparisons are made
        return(row[f'{var}_{pct}%'])

    if(value < cut(pcts[0])):
        return(f'{var}1')
    for bucket, (lo, hi) in enumerate(zip(pcts, pcts[1:]), start = 2):
        if(cut(lo) <= value < cut(hi)):
            return(f'{var}{bucket}')
    if(value >= cut(pcts[-1])):
        return(f'{var}10')
    return('--fail')
def sort_portfolios(self, stocks, char_bkpts, sorting_funcs, rebalance_freq, weight_type='vw', sort_month=7, ex_dividend=False, drop_na=True, breakpoint_exchanges=['1'], **kwargs)
Expand source code
def sort_portfolios(self, stocks, char_bkpts, sorting_funcs, rebalance_freq, weight_type = 'vw', sort_month = 7, ex_dividend = False, drop_na = True, breakpoint_exchanges = ['1'], **kwargs):
    """Sort stocks into characteristic portfolios and compute their returns.

    Args:
        stocks: dataframe of stock observations. Columns read here: me, wt,
            date, exchcd, permno, the return column ('adjret' or 'adjretx'),
            the sort characteristics, plus month and ffyear when
            rebalance_freq is 'A'
        char_bkpts: dict of characteristic -> list of quantiles, passed to
            breakpoint_ts
        sorting_funcs: dict of characteristic -> function(row, char) that
            returns a bucket label, or '--fail' if the row cannot be sorted
        rebalance_freq: 'A' sorts once a year on rows whose month equals
            sort_month and carries the assignment to every row sharing
            (permno, ffyear); any other value sorts on every date
        weight_type: 'vw' aggregates with py_functions.wavg using 'wt'
            (presumably a weighted average -- helper not visible here);
            any other value takes the plain mean
        sort_month: month used for the annual sort
        ex_dividend: if true use 'adjretx', otherwise 'adjret'
        drop_na: if true drop rows of the result containing any NaN
        breakpoint_exchanges: exchcd values whose stocks are used to
            compute the breakpoints
        **kwargs: accepted but not used

    Returns:
        A dataframe with a 'date' column, one return column per portfolio
        name (rank labels joined by '_') and one '<portfolio>_num_firms'
        column per portfolio.
    """
    annual = (rebalance_freq == 'A')
    ret_typ = 'adjretx' if(ex_dividend) else 'adjret'
    group_keys = ['date', 'port_name']

    # strictly positive me and wt only (comparisons are False for NaN, so NaNs go too)
    stocks = stocks[(stocks.me > 0) & (stocks.wt > 0)]
    stocks.date = pd.to_datetime(stocks.date)

    # rows on which the sort is performed
    rebalance_df = stocks[stocks.month == sort_month] if(annual) else stocks

    # breakpoints come only from the stocks on the breakpoint exchanges
    on_bkpt_exchange = rebalance_df.exchcd.isin(breakpoint_exchanges)
    breakpoints_df = self.breakpoint_ts(rebalance_df[on_bkpt_exchange], vars = char_bkpts)
    rebalance_df = breakpoints_df.merge(rebalance_df, how = 'inner', on = ['date'])

    # one rank column per characteristic, filled row by row
    rank_cols = [f'{char}_rank' for char in sorting_funcs]
    for rank_col, (char, func) in zip(rank_cols, sorting_funcs.items()):
        rebalance_df[rank_col] = rebalance_df.apply(func, args = (char, ), axis = 1)

    # drop rows any sorting function could not place
    for rank_col in rank_cols:
        if('--fail' in rebalance_df[rank_col].unique()):
            cprint.warn(f'There are stocks that could not be sorted in {rank_col}. They will be removed before constructing portfolios.')
            rebalance_df = rebalance_df[rebalance_df[rank_col] != '--fail']

    rebalance_df['port_name'] = rebalance_df[rank_cols].agg('_'.join, axis = 1)

    if(annual):
        # carry the sort-month assignment to every row of the same permno / ffyear
        assignments = rebalance_df[['permno', 'ffyear', 'port_name']]
        fin = stocks.merge(assignments, how = 'left', on = ['permno', 'ffyear'])
    else:
        fin = rebalance_df
    fin = fin.dropna(subset = ['port_name'])

    if(weight_type == 'vw'):
        port_rets = fin.groupby(group_keys).apply(py_functions.wavg, ret_typ, 'wt')
    else:
        port_rets = fin.groupby(group_keys).mean(numeric_only = True)[ret_typ]
    # the apply() result is unnamed, so its frame column is 0 until renamed
    rets = port_rets.to_frame().reset_index().rename(columns = {0: ret_typ})

    firm = fin.groupby(group_keys)['permno'].count().reset_index()
    firm = firm.rename(columns = {'permno': 'num_firms'})

    # long -> wide: one column per portfolio
    rets = rets.pivot(index = 'date', columns = 'port_name', values = ret_typ)
    firm = firm.pivot(index = 'date', columns = 'port_name', values = 'num_firms')
    firm = firm.add_suffix('_num_firms')

    res = rets.merge(firm, how = 'inner', on = ['date']).reset_index()
    if(drop_na):
        res = res.dropna()

    return(res)
def sort_quintile(self, row, var)
Expand source code
def sort_quintile(self, row, var):
    """Assign a row to one of five buckets using the 20/40/60/80% breakpoints.

    Bucket k covers [previous breakpoint, next breakpoint); the first
    bucket is everything below '<var>_20%' and the fifth everything at or
    above '<var>_80%'. Returns '--fail' when no comparison holds
    (e.g. a NaN value or breakpoint).
    """
    value = row[var]
    pcts = (20, 40, 60, 80)

    def cut(pct):
        # looked up lazily, in the same order the comparisons are made
        return(row[f'{var}_{pct}%'])

    if(value < cut(pcts[0])):
        return(f'{var}1')
    for bucket, (lo, hi) in enumerate(zip(pcts, pcts[1:]), start = 2):
        if(cut(lo) <= value < cut(hi)):
            return(f'{var}{bucket}')
    if(value >= cut(pcts[-1])):
        return(f'{var}5')
    return('--fail')