This dataset has been maintained by Timothy Carambat as part of the House Stock Watcher project. Each row in this dataset corresponds to one stock transaction carried out by a member of the U.S. House of Representatives.
There are several features, such as:
disclosure_date
transaction_date
ticker
asset_description
amount
representative
and several more...
For reference, the question we are going to be exploring is:
Do representatives from the state of Michigan have a higher average proportion of trades that result in capital gains over $200 (USD) than representatives from the rest of the states?
So in order to be able to answer this question, let's identify a few important columns: cap_gains_over_200_usd and state.
Here's an overview of our cleaning steps:
Convert disclosure_date and transaction_date to pandas datetime objects
Create a non_disclosure_period(days) column from the difference between those two dates
Create a state column from district
Create an amount_cleaned column from amount
We picked owner as the column to analyze missingness for because it contained the most missing values. We ran permutation tests to analyze whether the missingness of owner depends on other columns.
We used the Kolmogorov-Smirnov (KS) statistic as the test statistic for the two numerical columns, disclosure_year and amount_cleaned, and we used the TVD as the test statistic for the one categorical column, state.
For all three columns, our p-values fell below 0.05, which suggests that the distribution of each column when owner is missing and its distribution when owner is not missing are most likely different. This means that the missingness of owner depends on each of those columns individually.
The reps from the state of Michigan have 77% of their trades resulting in a True value in the cap_gains_over_200_usd column.
Is this just by chance, or are the representatives from some states, like The Great Lakes State, just that damn much better at trading than representatives from other states?
Null hypothesis: Reps from Michigan do not have a higher average proportion of trades resulting in capital gains over $200 than the rest of the population.
Alternative hypothesis: Reps from Michigan have a higher average proportion of trades resulting in capital gains over $200 than the rest of the population.
Test statistic: Difference of means between the average value of the cap_gains_over_200_usd column for Michigan reps and the average value for non-Michigan reps.
Significance level: 1%
Here's our setup for the hypothesis (permutation) test: shuffle the cap_gains_over_200_usd column, recompute the difference in means between the Michigan and non-Michigan groups in a dataframe, and store the resulting test stat in a list. Repeat 500 times. Finally, compare the observed test statistic to this distribution of simulated statistics to compute a p-value.
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy import stats
A note for this section: we decided to carry out cleaning and EDA before formulating our question, hoping that an interesting question would dawn upon us while doing EDA, so our cleaning includes more steps than strictly required for the eventual hypothesis test.
df = pd.read_csv('data/all_transactions.csv')
# splitting up so dataframe is displayable in the final pdf :)
display(df.iloc[:, :8].head())
display(df.iloc[:, 8:].head())
disclosure_year | disclosure_date | transaction_date | owner | ticker | asset_description | type | amount | |
---|---|---|---|---|---|---|---|---|
0 | 2021 | 10/04/2021 | 2021-09-27 | joint | BP | BP plc | purchase | $1,001 - $15,000 |
1 | 2021 | 10/04/2021 | 2021-09-13 | joint | XOM | Exxon Mobil Corporation | purchase | $1,001 - $15,000 |
2 | 2021 | 10/04/2021 | 2021-09-10 | joint | ILPT | Industrial Logistics Properties Trust - Common... | purchase | $15,001 - $50,000 |
3 | 2021 | 10/04/2021 | 2021-09-28 | joint | PM | Phillip Morris International Inc | purchase | $15,001 - $50,000 |
4 | 2021 | 10/04/2021 | 2021-09-17 | self | BLK | BlackRock Inc | sale_partial | $1,001 - $15,000 |
representative | district | ptr_link | cap_gains_over_200_usd | |
---|---|---|---|---|
0 | Hon. Virginia Foxx | NC05 | https://disclosures-clerk.house.gov/public_dis... | False |
1 | Hon. Virginia Foxx | NC05 | https://disclosures-clerk.house.gov/public_dis... | False |
2 | Hon. Virginia Foxx | NC05 | https://disclosures-clerk.house.gov/public_dis... | False |
3 | Hon. Virginia Foxx | NC05 | https://disclosures-clerk.house.gov/public_dis... | False |
4 | Hon. Alan S. Lowenthal | CA47 | https://disclosures-clerk.house.gov/public_dis... | False |
Let's look at the types of the columns to see which ones may need to have their types changed.
df.dtypes
disclosure_year             int64
disclosure_date            object
transaction_date           object
owner                      object
ticker                     object
asset_description          object
type                       object
amount                     object
representative             object
district                   object
ptr_link                   object
cap_gains_over_200_usd       bool
dtype: object
disclosure_date and transaction_date can be represented as datetimes, but they are plain objects right now.
Let's change them to pandas datetime objects.
# need errors='coerce' so that unparseable dates get set to NaT (pandas's missing-datetime value)
df['disclosure_date'] = pd.to_datetime(df['disclosure_date'], errors = 'coerce')
df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors = 'coerce')
df.dtypes
disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
owner                             object
ticker                            object
asset_description                 object
type                              object
amount                            object
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object
This next step isn't super important for data analysis, but I'd say it is very important for readability. If you can't read your data, what will you do??
Let's clean the 'Hon. ' out of every rep's name so it becomes easier to read.
# regex=False to get rid of the regex deprecation warning
df['representative'] = df['representative'].str.replace('Hon. ', '', regex=False)
Let's create a state column so we can group by state if we need to in the future.
df['state'] = df['district'].str[:2]
df[['state']].head()
state | |
---|---|
0 | NC |
1 | NC |
2 | NC |
3 | NC |
4 | CA |
Now let's deal with the ranges in the amount column. The ranges are an issue for 2 reasons: they are stored as strings, so we can't do arithmetic with them, and they don't give us a single numeric value to work with.
So let's create a new column, amount_cleaned, that consists of the mean of the range given in amount.
And for values without a full range, e.g. '$1,001 -', we'll deal with those after we fill in the ranges with their averages.
# create a dictionary to map range values to their average value
mapped_values = {
    '$1,001 - $15,000': 8_000.5,
    '$15,001 - $50,000': 32_500.5,
    '$50,001 - $100,000': 75_000.5,
    '$100,001 - $250,000': 175_000.5,
    # placeholder so we can calculate a value by filling in an average later
    '$1,001 -': -99,
    '$250,001 - $500,000': 375_000.5,
    '$500,001 - $1,000,000': 375_000.5,
    '$1,000,001 - $5,000,000': 3_000_000.5,
    # placeholder so we can calculate a value by filling in an average later
    '$1,000,000 +': -100,
    '$5,000,001 - $25,000,000': 15_000_000.5,
    '$1,000 - $15,000': 8_000,
    '$15,000 - $50,000': 32_000,
    # we aren't giving 50,000,000+ a placeholder here, because we think
    # 50,000,000+ is already so much that we'll just keep the value as is.
    '$50,000,000 +': 50_000_000,
    '$1,000,000 - $5,000,000': 3_000_000,
}
# replace range values with their average value
df['amount_cleaned'] = df['amount'].replace(mapped_values)
Now let's deal with the non-ranges, i.e., '$1,001 -' and '$1,000,000 +'.
Let's find the average of the amount_cleaned column for values at or above $1,001 and at or above $1,000,000 separately, and then replace the respective placeholder values.
avg_above_1001 = round(df.loc[df['amount_cleaned'] >= 1_001]['amount_cleaned'].mean(), 2)
avg_above_1_mil = round(df.loc[df['amount_cleaned'] >= 1_000_000]['amount_cleaned'].mean(), 2)
avg_above_1001, avg_above_1_mil
(53267.76, 6163265.79)
# replace -99, which maps to '$1,001 -' in 'amounts' with avg_above_1001
df['amount_cleaned'] = df['amount_cleaned'].replace({-99: avg_above_1001})
# replace -100, which maps to '$1,000,000 +' in 'amounts' with avg_above_1_mil
df['amount_cleaned'] = df['amount_cleaned'].replace({-100: avg_above_1_mil})
# let's make sure the -99 and -100 placeholder values are gone!
(-99 in df['amount_cleaned'].values) or (-100 in df['amount_cleaned'].values)
False
# let's make sure our newly added column is of type float
df.dtypes.iloc[-1]
dtype('float64')
What if we wanted to examine how long it takes for reps to disclose their trades? Let's create a new column called non_disclosure_period(days).
Our previous work of converting disclosure_date and transaction_date to datetime objects also comes in handy here, since we can simply subtract the 2 columns.
# assign a new column 'non_disclosure_period(days)' to be difference of 'disclosure_date' and 'transaction_date'
df['non_disclosure_period(days)'] = (df['disclosure_date'] - df['transaction_date']).dt.days
df[['non_disclosure_period(days)']].head()
non_disclosure_period(days) | |
---|---|
0 | 7.0 |
1 | 21.0 |
2 | 24.0 |
3 | 6.0 |
4 | 17.0 |
Let's see if there are any weird values in this new column.
df[df['non_disclosure_period(days)'] < 0].shape[0]
13
There are 13 transactions where the non-disclosure period is negative. We will exclude those.
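For reference, here's a minimal sketch of one way to filter them out (into a separate dataframe so the original df stays intact; the name df_valid_periods is just for illustration, and our later plots simply filter on the fly):
# keep rows whose non-disclosure period is non-negative (rows with missing dates stay too)
df_valid_periods = df[~(df['non_disclosure_period(days)'] < 0)]
# sanity check: no negative periods remain
assert (df_valid_periods['non_disclosure_period(days)'] < 0).sum() == 0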
This concludes our data cleaning section.
Note: value_counts() will help us understand the values of a column by showing us how many times each entry occurs.
Let's explore the disclosure_year column first.
# first call .value_counts(), then .plot(), so graph is easier to read
df['disclosure_year'].value_counts().plot(kind='bar',
    title='Year of Stock Transaction Disclosure',
    xlabel='Year', ylabel='Number of stock transactions', figsize=(8, 6));
Clearly 2020 was the most popular year in this data set. This doesn't mean that the reps just traded more in 2020 than in 2021 or 2022, they probably just have not disclosed trades from those two years yet.
Now let's look at the ticker column. There are too many individual tickers (abbreviations used to uniquely identify shares of a particular stock), so we will only look at the top 20.
df['ticker'].value_counts()[:20].plot(kind = 'barh', figsize=(10,6))
plt.title('20 Most Popular Stock Transactions by Ticker')
plt.ylabel('ticker')
plt.xlabel('number of transactions')
plt.show()
Clearly, the most popular stock trades are in tech. But there is even a Fed Fund in the mix.
Surprised rich people are investing in fed funds?
Hey, maybe the old people in the House of Representatives really do care about their retirement funds!
The biggest surprise, however, is the most popular ticker, '--', which isn't even a stock. What is '--'? Let's clean that up and convert it to np.nan.
df['ticker'] = df['ticker'].replace({'--': np.nan})
Let's make sure the -- values are gone.
# resulting dataframe should have 0 rows, .shape[0] should be equal to 0
df[df['ticker'] == '--'].shape[0] == 0
True
Now let's look at the amount_cleaned column, to see how much of that 💰💸🤑 reps bring in.
# first call .value_counts(), then .plot(), so graph is easier to read
df['amount_cleaned'].value_counts().plot(kind='bar',
    title='How many transactions reps make, by size', ylabel='Number of transactions',
    xlabel='Transaction Amount (averaged)', figsize=(8, 6));
Wow! Such an overwhelming majority of stock trades average out to 8,000.5 USD that the graph gets stretched vertically so much that we can't even estimate how many transactions fall into any category above 53,267.76 USD.
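One way to make the smaller bars readable (just an idea, not part of our original analysis) is to put the count axis on a log scale:
# same bar chart, but with a log-scaled y-axis so the rare transaction sizes are visible
df['amount_cleaned'].value_counts().plot(kind='bar', logy=True,
    title='How many transactions reps make, by size (log scale)',
    xlabel='Transaction Amount (averaged)', ylabel='Number of transactions', figsize=(8, 6));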
We want to see whether reps have many trades with capital gains over $200 or few. Let's find out by counting the combinations of amount_cleaned and cap_gains_over_200_usd.
df[['amount_cleaned', 'cap_gains_over_200_usd']].value_counts().plot(kind = 'barh', figsize=(10,6))
plt.xlabel("Count")
plt.ylabel("(amount, capital gains over $200)")
plt.show()
Let's see if the non_disclosure_period(days) column has any correlation with amount_cleaned.
Does a bigger stock transaction (a higher amount_cleaned) lead to a longer non_disclosure_period(days)? Let's find out.
relevant_non_disclosure_periods = df.loc[df['non_disclosure_period(days)'] > 0][
    ['amount_cleaned', 'non_disclosure_period(days)']
]
relevant_non_disclosure_periods.plot(kind='scatter', x='amount_cleaned',
    y='non_disclosure_period(days)', figsize=(10, 6));
Before analyzing anything, an important point: this graph seems kinda weird right? Why are all the points jumbled around a few x values?
Well, this is the result of replacing a range with the average of the range.
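A quick way to confirm this is to list the distinct values the cleaned column can take; there are only a handful, one per mapped range (a sketch, output omitted):
# the cleaned column only takes a handful of distinct values, one per mapped range
sorted(df['amount_cleaned'].dropna().unique())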
As most of our data seems to involve transaction amounts below $2 million, let's zoom in on that region to see if we notice anything.
closer_look = relevant_non_disclosure_periods[relevant_non_disclosure_periods['amount_cleaned'] < 2_000_000]
closer_look.plot(kind='scatter', x='amount_cleaned', y = 'non_disclosure_period(days)', figsize=(10,6));
Huh!
We notice that there doesn't seem to be much of a relationship between transaction amount and the non-disclosure period. Our hypothesis was that reps who invest more in a trade delay disclosing it, but here we see something closer to the opposite: if the transaction was large, they seem to disclose it sooner than if the transaction was small.
This makes sense: they probably don't want to get into legal trouble. They are politicians after all, and they will do anything to make sure they get reelected. I'm sure headlines like
Rep. X. arrested after failing to disclose a trade of $5,000,000.
won't be great for their political career.
Let's find out if there are any reps with a 100% cap_gains_over_200_usd proportion score.
(
    df.pivot_table(index='representative', values='cap_gains_over_200_usd')
    .sort_values(by='cap_gains_over_200_usd', ascending=False)
)
cap_gains_over_200_usd | |
---|---|
representative | |
Patrick T. McHenry | 1.000000 |
Mr. TJ John (Tj) Cox | 1.000000 |
Tim Burchett | 1.000000 |
Mr. Peter Meijer | 0.902256 |
Bradley S. Schneider | 0.777778 |
... | ... |
Harold Dallas Rogers | 0.000000 |
Harley E. Rouda | 0.000000 |
Gus M. Bilirakis | 0.000000 |
Greg Steube | 0.000000 |
Katherine M. Clark | 0.000000 |
178 rows × 1 columns
It looks like Reps. Patrick T. McHenry, Mr. TJ John (Tj) Cox, and Tim Burchett are pros, with a 100% cap_gains_over_200_usd proportion score. But are they really?
df[
    (df['representative'] == 'Patrick T. McHenry') |
    (df['representative'] == 'Mr. TJ John (Tj) Cox') |
    (df['representative'] == 'Tim Burchett')
]
disclosure_year | disclosure_date | transaction_date | owner | ticker | asset_description | type | amount | representative | district | ptr_link | cap_gains_over_200_usd | state | amount_cleaned | non_disclosure_period(days) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
56 | 2020 | 2020-09-22 | 2020-08-17 | NaN | NaN | Metallic Minerals Corp. | sale_partial | $100,001 - $250,000 | Mr. TJ John (Tj) Cox | CA21 | https://disclosures-clerk.house.gov/public_dis... | True | CA | 175000.5 | 36.0 |
10854 | 2021 | 2021-01-10 | 2020-11-19 | NaN | D | Sale of shares in Dominion Energy Inc. | sale_full | $15,001 - $50,000 | Patrick T. McHenry | NC10 | https://disclosures-clerk.house.gov/public_dis... | True | NC | 32500.5 | 52.0 |
13381 | 2020 | 2020-02-26 | 2020-02-12 | NaN | DENN | Denny's Corporation | sale_full | $1,001 - $15,000 | Tim Burchett | TN02 | https://disclosures-clerk.house.gov/public_dis... | True | TN | 8000.5 | 14.0 |
Upon further review, it seems that each of them has only traded once, so we don't have enough data to conclude that they are really pros at stock trading.
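As a quick sanity check, we can also count the recorded trades per representative directly (a sketch; the counts match the single rows shown above):
# number of recorded trades for each of the three 'perfect' reps
df['representative'].value_counts().loc[
    ['Patrick T. McHenry', 'Mr. TJ John (Tj) Cox', 'Tim Burchett']
]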
Ok, that was a flop.
Here's a new one to aggregate: which state has the highest average proportion of trades that result in capital gains over $200?
(
    df.pivot_table(index='state', values='cap_gains_over_200_usd')
    .sort_values(by='cap_gains_over_200_usd', ascending=False)
    .head()
)
cap_gains_over_200_usd | |
---|---|
state | |
MI | 0.772871 |
SC | 0.379310 |
NY | 0.280702 |
ID | 0.222222 |
LA | 0.222222 |
Michigan! With a pretty mind blowing result of 77% 🤯
Does the value of 77% seem sussy? Well, we think so. (Yes, this is foreshadowing for our Hypothesis Testing.)
Note: the first thing that may come to mind is that Michigan may have had a high cap_gains_over_200_usd value only for relatively small trades, in the $8,000.5 category, and low numbers in the rest. This would have induced Simpson's paradox. But we checked with another pivot table (sketched below), and it really is the case that Michigan has high cap_gains_over_200_usd values across the amount categories.
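That check isn't reproduced in full here, but it looked roughly like the following (a sketch with output omitted; the exact pivot we used may have differed slightly):
# proportion of trades with capital gains over $200, by state and (averaged) transaction amount;
# the check: Michigan's row stays high across the amount categories it appears in
df.pivot_table(
    index='state',
    columns='amount_cleaned',
    values='cap_gains_over_200_usd',
    aggfunc='mean',
).loc['MI']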
Let's look at non_disclosure_period(days) values indexed by the stock transaction amount, for each transaction type.
df.pivot_table(index='amount_cleaned', values='non_disclosure_period(days)', columns='type')
type | exchange | purchase | sale_full | sale_partial |
---|---|---|---|---|
amount_cleaned | ||||
8000.00 | NaN | NaN | 19.000000 | 19.000000 |
8000.50 | 149.081967 | 65.273784 | 73.570919 | 57.616108 |
32000.00 | NaN | NaN | 19.000000 | 19.000000 |
32500.50 | 35.285714 | 49.260524 | 44.978177 | 80.050562 |
53267.76 | 19.000000 | 10.752688 | 39.545455 | 13.031746 |
75000.50 | 40.875000 | 42.727506 | 42.405738 | 63.606557 |
175000.50 | 40.500000 | 25.249097 | 53.600917 | 40.725000 |
375000.50 | 33.333333 | 27.622754 | 37.950311 | 47.933333 |
3000000.00 | NaN | NaN | 23.000000 | NaN |
3000000.50 | 43.000000 | 29.210526 | 21.000000 | 25.666667 |
6163265.79 | NaN | 97.333333 | 54.769231 | 30.166667 |
15000000.50 | NaN | 21.250000 | 35.333333 | 39.500000 |
50000000.00 | NaN | NaN | 37.000000 | NaN |
This table is confusing, so let's look at just the means of each of the columns.
df.pivot_table(index='amount_cleaned', values='non_disclosure_period(days)', columns='type').mean()
type
exchange        51.582288
purchase        40.964468
sale_full       38.550314
sale_partial    39.663331
dtype: float64
For some reason, or just sheer random chance, exchanges take on average almost 12 days longer to be disclosed than purchases, full sales, and partial sales. Interesting 🤔🤔
This concludes our EDA section.
Here are the missing value counts per column (see the sketch below for how to compute them):
asset_description: 4
owner: 5333
ticker: 1141
transaction_date: 5
Because owner has the most missing values, we will use that column to analyze whether the missingness of owner depends on any other columns. To find out, we will run permutation tests.
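These counts come from checking each column for nulls; a minimal sketch of that check:
# count missing values in each column, largest first
df.isna().sum().sort_values(ascending=False)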
df_copy = df.copy()
# assigning `owner_missing` column to True if owner val is missing; else False
df_copy['owner_missing'] = df_copy['owner'].isna()
We will use the KS statistic (ks_2samp from the SciPy library) as our test statistic here. Since the KS statistic only applies to numerical columns, let's focus on two numerical columns for now.
cols_to_choose = 'disclosure_year amount_cleaned'.split(" ")
new_dict = {}
for col in cols_to_choose:
    # when 'owner' is missing
    col_owner_mis = df_copy.loc[df_copy['owner_missing'], col]
    # when 'owner' is not missing
    col_owner_not_mis = df_copy.loc[~df_copy['owner_missing'], col]
    # ks_2samp will perform the two-sample Kolmogorov-Smirnov test for goodness of fit
    val = stats.ks_2samp(col_owner_mis, col_owner_not_mis)
    new_dict[col] = val
new_dict
{'disclosure_year': KstestResult(statistic=0.10445941940037393, pvalue=4.373019393958429e-32), 'amount_cleaned': KstestResult(statistic=0.02360406573781591, pvalue=0.04792107763601705)}
The p-value for disclosure_year is extremely low, and the p-value for amount_cleaned falls below 0.05. This suggests that the distribution of disclosure_year, for instance, when owner is missing and the distribution of disclosure_year when owner is not missing are likely different, which means that the missingness of owner likely depends on disclosure_year. The same goes for amount_cleaned.
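Note that above we relied on the p-value that ks_2samp computes directly, rather than simulating one ourselves. For reference, an explicit permutation version using the KS statistic as the test statistic could look roughly like this (a sketch, not part of our original analysis; it reuses df_copy and 500 shuffles as elsewhere in this project):
# explicit permutation test for one column, using the KS statistic as the test stat
col = 'disclosure_year'
obs_ks = stats.ks_2samp(
    df_copy.loc[df_copy['owner_missing'], col],
    df_copy.loc[~df_copy['owner_missing'], col],
).statistic

ks_stats = []
for _ in range(500):
    # shuffle which rows count as 'missing' and recompute the KS statistic
    shuffled_missing = np.random.permutation(df_copy['owner_missing'])
    sim = stats.ks_2samp(
        df_copy.loc[shuffled_missing, col],
        df_copy.loc[~shuffled_missing, col],
    ).statistic
    ks_stats.append(sim)

# proportion of shuffled KS statistics at least as large as the observed one
np.mean(np.array(ks_stats) >= obs_ks)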
Let's find one more column, this time a categorical one, on which the missingness of owner might depend. When the potentially depended-on column is categorical, the test statistic we need is the total variation distance (TVD).
# making a copy so we don't modify the original df
shuffled = df.copy()
# again assigning `owner_missing` column to True if owner val is missing; else False
shuffled['owner_missing'] = shuffled['owner'].isna()
tvds = []
for _ in range(500):
    # shuffle the values in the state column and put them back into the df
    shuffled['state'] = np.random.permutation(shuffled['state'])
    # resulting df has 2 rows: one for when `owner` val is missing and one for when it is not;
    # the columns are the states
    pivoted = (
        shuffled
        .pivot_table(index='owner_missing', columns='state', aggfunc='size')
        .apply(lambda x: x / x.sum(), axis=1)
    )
    # TVD: half the sum of absolute differences between the two rows, across states
    tvd = pivoted.diff().iloc[-1].abs().sum() / 2
    tvds.append(tvd)
df_copy = df.copy()
dist = (
    df_copy
    .assign(owner_missing=df_copy['owner'].isna())
    .pivot_table(index='state', columns='owner_missing', aggfunc='size')
)
dist = dist / dist.sum()
obs_tvd = dist.diff(axis=1).iloc[:, -1].abs().sum() / 2
obs_tvd
0.4196815040820534
pval = np.mean(np.array(tvds) >= obs_tvd)
pval
0.0
Here, we see that the p-value is 0.0, implying that the distribution of state when owner is missing and the distribution of state when owner is not missing are likely different, so the missingness of owner likely depends on state.
The question we've decided to explore: why are Michigan reps so damn good at trading stocks?
Here's a little bit of background again to refresh your memory.
In our interesting aggregates / pivot tables section, we wanted to answer: which state has the highest average proportion of trades that result in capital gains over $200?
(
    df.pivot_table(index='state', values='cap_gains_over_200_usd')
    .sort_values(by='cap_gains_over_200_usd', ascending=False)
    .head()
)
cap_gains_over_200_usd | |
---|---|
state | |
MI | 0.772871 |
SC | 0.379310 |
NY | 0.280702 |
ID | 0.222222 |
LA | 0.222222 |
Formalized...
The reps from the state of Michigan have 77% of their trades resulting in a True value in the cap_gains_over_200_usd column.
Is this just by chance, or are the representatives from some states, like The Great Lakes State, just that damn much better at trading than representatives from other states?
Null hypothesis: Reps from Michigan do not have a higher average proportion of trades resulting in capital gains over $200 than the rest of the population.
Alternative hypothesis: Reps from Michigan have a higher average proportion of trades resulting in capital gains over $200 than the rest of the population.
Test statistic: Difference of means between the average value of the cap_gains_over_200_usd column for Michigan reps and the average value for non-Michigan reps.
Significance level: 1%
We're choosing a permutation test because we have two observed samples (rather than the single sample that a standard hypothesis test works with). One group is all trades from Michigan reps, and the other group is all trades from non-Michigan reps. We need to see whether they are fundamentally different, or whether they could have been generated by the same process.
Let's begin the permutation testing.
First, let's find the observed test statistic: difference in means.
# let's only choose the columns we need
ptest_df = df[['state', 'cap_gains_over_200_usd']]
# separate into 2 groups
ptest_df_MI = ptest_df.loc[ptest_df['state'] == 'MI']
ptest_df_NOTMI = ptest_df.loc[ptest_df['state'] != 'MI']
# calculate difference of means
observed_diff_means = ptest_df_MI['cap_gains_over_200_usd'].mean() - \
    ptest_df_NOTMI['cap_gains_over_200_usd'].mean()
observed_diff_means
0.7210881633608198
So the reps from Michigan have an average value of cap_gains_over_200_usd substantially higher than non-Michigan reps, by about 72 percentage points.
Now let's run the permutation test 500 times.
diff_means = []
N = 500
for _ in range(N):
    # shuffle the cap_gains_over_200_usd column
    shuffled_gains = (
        ptest_df['cap_gains_over_200_usd']
        .sample(frac=1)
        # we need to reset_index w/ drop=True, otherwise the old index values will persist
        .reset_index(drop=True)
    )
    # assign a new dataframe with the shuffled column
    ptest_df_shuffled = (
        ptest_df.assign(**{'shuffled_cap_gains_over_200_usd': shuffled_gains})
    )
    # Michigan-only values
    MI_only = (
        ptest_df_shuffled.loc[ptest_df_shuffled['state'] == 'MI']
    )
    # non-Michigan-only values
    not_MI = (
        ptest_df_shuffled.loc[ptest_df_shuffled['state'] != 'MI']
    )
    # compute the difference in means by subtracting the non-MI group's mean
    # from the MI group's mean
    test_stat = (
        MI_only['shuffled_cap_gains_over_200_usd'].mean() -
        not_MI['shuffled_cap_gains_over_200_usd'].mean()
    )
    diff_means.append(test_stat)
Now let's calculate the p-value!
The p-value represents the proportion of our simulated test statistics in diff_means that are at least as extreme as our observed test statistic, observed_diff_means.
p_val = np.mean(np.array(diff_means) >= observed_diff_means)
p_val
0.0
0!
So this means that our result is highly significant, and we can reject the null hypothesis since our p-value (0.0) is lower than our significance level (0.01).
That is, reps from Michigan did indeed have a higher average proportion of trades resulting in capital gains over $200 than the rest of the population, and this is very unlikely to be due to chance alone.
If it were only due to chance, our p-value would not be so low, and we would not reject the null hypothesis.
A few implications from our results: