Remind me again how we did that? Creating convenient and reproducible workflows using Jupyter Notebook

MORPC Data Day
March 1, 2023

Adam Porr
Research & Data Officer
Mid-Ohio Regional Planning Commission

Abstract¶

Attendees will be introduced to Jupyter notebooks and how they can be used to produce well-documented, automated workflows using Python code. We will walk through an existing Jupyter notebook that makes use of the popular pandas data analysis and manipulation library (among others) and its geographic extension (i.e., GeoPandas) to retrieve geographic data and attribute data from the Census website, integrate the retrieved data, and perform a simple, common analysis task using the data. We anticipate that attendees will already have experience downloading, integrating, and analyzing Census data using other tools (e.g., data.census.gov, Excel, ArcGIS), so this presentation will focus on automating that workflow using Jupyter and on the efficiency, transparency, and reproducibility benefits that can be realized from this strategy. All of the tools demonstrated during the presentation are free and open source, and the notebook will be made available so that attendees can experiment with the workflow after the presentation. Basic familiarity with Python or another programming language is helpful for understanding the workflow implementation, but is not necessary to appreciate the primary learning objectives.

My goal for this presentation: I want you to want to create reproducible workflows.

Dishes scene from "The Break Up"

How I'll make my case¶

  • Rant about the problem as I see it.
  • Introduce a shiny new tool (Jupyter)
  • Show off a tiny fraction of the cool things you can do with Jupyter while...
    • Demonstrating how work done in Jupyter is inherently more reproducible
    • Suggesting practices to make your Jupyter workflows even more reproducible

Why we need reproducible workflows¶

Have you ever been guilty of this?¶

XKCD comic

Image credit: Randall Munroe (XKCD)

My favorite is "Big official report FINAL v2 AMP comments.docx"

Or this?¶

File not found

Or this?¶

#TODO: Figure out what I’m doing here and comment accordingly.

// Peter wrote this, nobody knows what it does, don't change it!

// drunk, fix later

Why are we so bad at creating reproducible workflows?¶

Comic

  • Creating reproducible workflows used to be difficult and tedious
  • No wonder data scientists (and most other people) are bad at making workflows reproducible
  • Creating reproducible workflows will lead to better:
    • Transparency
    • Efficiency
    • Sanity!

The case for programming your analysis¶

Code inherently provides documentation¶

r = requests.get(DECENNIAL_API_SOURCE_URL, params=DECENNIAL_API_PARAMS)
decennialRaw = r.json()
columns = decennialRaw.pop(0)
decennialData = pd.DataFrame.from_records(decennialRaw, columns=columns) \
    .filter(items=morpc.avro_get_field_names(decennialSchema), axis="columns") \
    .astype(morpc.avro_to_pandas_dtype_map(decennialSchema)) \
    .rename(columns=morpc.avro_map_to_first_alias(decennialSchema))
decennialData["GEOID"] = decennialData["GEOID"].apply(lambda x: x.split("US")[1])

Example JSON

Example table

Code does only what you tell it to do¶

Comic

Code allows you to repeat steps with minimal effort¶

Example model
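For instance, once the analysis lives in code, rerunning it for a different set of counties or a different year is a parameter change rather than a repeat of every manual step. A minimal, hypothetical sketch (the function name and parameters are illustrative, not part of the workflow shown later):

# Hypothetical sketch: the whole analysis wrapped in a parameterized function,
# so rerunning it for another region is one line instead of an afternoon.
def run_popchange_analysis(county_names, state_fips, year):
    ...  # fetch, filter, join, plot -- the steps demonstrated later in this notebook

run_popchange_analysis(["Franklin", "Delaware"], state_fips="39", year=2019)
run_popchange_analysis(["Hamilton", "Butler"], state_fips="39", year=2019)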

The case for literate programming¶

A few thoughtful, well-placed comments go a long way¶

<svg width="200" height="250" version="1.1" xmlns="http://www.w3.org/2000/svg">
  <polygon points="50 160 55 180 70 180 60 190 65 205 50 195 35 205 40 190 30 180 45 180"
      stroke="green" fill="transparent" stroke-width="5"/>
</svg>

Draw a green five-pointed star

But a picture is even better!¶

Star

Sometimes you just need the gist¶

Outline

Literate programming reads like a book but runs like code¶

Donald Knuth, the author of Literate Programming

Jupyter makes literate programming possible for mere mortals¶

Jupyter notebook screenshot

Literate programming by example¶

Write an introduction¶

In the remainder of this notebook, I'll demonstrate a simple (but hopefully compelling) use case for Jupyter that highlights its literate programming capabilities. It uses the popular pandas data analysis library and its spatially-enabled companion, geopandas. The notebook will fetch U.S. Census data from the web, perform some common manipulations on it, plot it, map it, and run a statistical analysis. Since I included an introduction, you can decide whether you care about any of that without having to interpret the rest of the document.

Specify the input and output data¶

It's good to specify all prerequisites at the beginning of the document, including data, parameters, and required libraries. This helps the reader understand the inputs and outputs and what steps they may have to take prior to running the script.

Input data¶

We'll retrieve population estimates and factors of population change from the Census Population Estimates Program (PEP) website and archive a local copy.

In [1]:
CENSUS_PEP_URL = "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
CENSUS_PEP_ARCHIVE_PATH = "./input_data/census_pep.csv"

We'll also need the county polygons Shapefile from the Census geography program.

In [2]:
CENSUS_COUNTY_POLYGONS_URL = "https://www2.census.gov/geo/tiger/TIGER2019/COUNTY/tl_2019_us_county.zip"
CENSUS_COUNTY_POLYGONS_ARCHIVE_PATH = "./input_data/county_polygons.shp"

Output data¶

We'll produce a spatial dataset (a Shapefile) that includes the geometries and numerical population change for Franklin County and the surrounding counties.

In [3]:
POPCHANGE_FEATURECLASS_PATH = "./output_data/popchange.shp"

Don't forget to specify the schemas!¶

Schemas help humans understand how datasets are structured and what the variables represent. If the schema is specified in a machine-readable format like this one, the computer can also parse the data without additional effort from a human (see the sketch after the schema).

In [4]:
POPCHG_FEATURECLASS_SCHEMA = {
    "type": "record",
    "doc": "Numerical population change for Central Ohio counties for the years 2010 to 2019.",
    "fields": [
        {"name":"CTYNAME", "type":"string", "doc":"Name of the county"},
        {"name":"2010", "type":"int", "doc":"Numeric change in resident total population 4/1/2010 to 7/1/2010"},
        {"name":"2011", "type":"int", "doc":"Numeric change in resident total population 7/1/2010 to 7/1/2011"},
        {"name":"2012", "type":"int", "doc":"Numeric change in resident total population 7/1/2011 to 7/1/2012"},
        {"name":"2013", "type":"int", "doc":"Numeric change in resident total population 7/1/2012 to 7/1/2013"},
        {"name":"2014", "type":"int", "doc":"Numeric change in resident total population 7/1/2013 to 7/1/2014"},
        {"name":"2015", "type":"int", "doc":"Numeric change in resident total population 7/1/2014 to 7/1/2015"},
        {"name":"2016", "type":"int", "doc":"Numeric change in resident total population 7/1/2015 to 7/1/2016"},
        {"name":"2017", "type":"int", "doc":"Numeric change in resident total population 7/1/2016 to 7/1/2017"},
        {"name":"2018", "type":"int", "doc":"Numeric change in resident total population 7/1/2017 to 7/1/2018"},
        {"name":"2019", "type":"int", "doc":"Numeric change in resident total population 7/1/2018 to 7/1/2019"}
    ]
}
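Because the schema is just a data structure, downstream code can derive column lists, dtypes, and a data dictionary from it instead of hard-coding them. A minimal sketch (the Avro-to-pandas type mapping is an assumption for illustration):

# Derive parsing instructions from the schema rather than hard-coding them
AVRO_TO_PANDAS = {"string": "object", "int": "int64"}   # assumed type mapping

field_names = [f["name"] for f in POPCHG_FEATURECLASS_SCHEMA["fields"]]
dtypes = {f["name"]: AVRO_TO_PANDAS[f["type"]] for f in POPCHG_FEATURECLASS_SCHEMA["fields"]}
data_dictionary = {f["name"]: f["doc"] for f in POPCHG_FEATURECLASS_SCHEMA["fields"]}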

Specify any parameters¶

In [5]:
# For COUNTY_NAMES, enter a list of strings representing the names of the counties of interest
COUNTY_NAMES = \
    ["Franklin","Fairfield","Pickaway","Madison", 
     "Union", "Delaware", "Licking"]

# For STATE_NAME, enter a string representing the name of the state where the counties of
# interest are located. For STATE_FIPS, enter the two-digit FIPS code assigned to the state
# by the Census Bureau.
STATE_NAME = "Ohio"
STATE_FIPS = "39"

Import required libraries¶

In [6]:
import os                  # Perform basic filesystem operations, such as creating directories
import pandas as pd        # Create and manipulate tabular data in the form of dataframes
import geopandas as gpd    # Create and manipulate spatial data in the form of geodataframes
from scipy import stats    # Perform statistical analysis
import numpy as np         # General purpose numerical computation library

Prepare the environment¶

Create subdirectories to store the input data and output data if they don't already exist.

In [7]:
if not os.path.exists("./input_data"):
    os.makedirs("./input_data")
    
if not os.path.exists("./output_data"):
    os.makedirs("./output_data")    

Finally, let's get some data! Pandas makes working with tabular data a dream.¶

Read the CSV file directly from the Census website. This particular file uses an atypical text encoding, so we have to specify it explicitly; pandas' default encoding (UTF-8) works for most CSV files. After downloading the file, save an archival copy and display some sample records.

In [8]:
censusPepRaw = pd.read_csv(CENSUS_PEP_URL, encoding="ISO-8859-1")
censusPepRaw.to_csv(CENSUS_PEP_ARCHIVE_PATH, index=False)
# If you need to load the archival copy, use the following line instead
#censusPepRaw = pd.read_csv(CENSUS_PEP_ARCHIVE_PATH, encoding="ISO-8859-1")
censusPepRaw.head()
Out[8]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2019 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015 RNETMIG2016 RNETMIG2017 RNETMIG2018 RNETMIG2019
0 40 3 6 1 0 Alabama Alabama 4779736 4780125 4785437 ... 1.917501 0.578434 1.186314 1.522549 0.563489 0.626357 0.745172 1.090366 1.773786 2.483744
1 50 3 6 1 1 Alabama Autauga County 54571 54597 54773 ... 4.847310 6.018182 -6.226119 -3.902226 1.970443 -1.712875 4.777171 0.849656 0.540916 4.560062
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183112 ... 24.017829 16.641870 17.488579 22.751474 20.184334 17.725964 21.279291 22.398256 24.727215 24.380567
3 50 3 6 1 5 Alabama Barbour County 27457 27455 27327 ... -5.690302 0.292676 -6.897817 -8.132185 -5.140431 -15.724575 -18.238016 -24.998528 -8.754922 -5.165664
4 50 3 6 1 7 Alabama Bibb County 22915 22915 22870 ... 1.385134 -4.998356 -3.787545 -5.797999 1.331144 1.329817 -0.708717 -3.234669 -6.857092 1.831952

5 rows × 164 columns

Extract only the county-level records (SUMLEV equal to 50) whose state name (STNAME) and county name (CTYNAME) match those specified in the parameters. When specifying the county names, it is necessary to append " County" to match the format of CTYNAME.

In [9]:
censusPep = censusPepRaw.loc[ \
    (censusPepRaw["SUMLEV"] == 50) & \
    (censusPepRaw["STNAME"] == STATE_NAME) & \
    (censusPepRaw["CTYNAME"].isin(["{} County".format(x) for x in COUNTY_NAMES]))
].copy()
censusPep.head()     
Out[9]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2019 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015 RNETMIG2016 RNETMIG2017 RNETMIG2018 RNETMIG2019
2099 50 2 3 39 41 Ohio Delaware County 174214 174172 175099 ... 14.546139 12.775921 7.779526 16.534473 14.817779 14.727125 13.806701 13.677911 16.139481 15.569631
2101 50 2 3 39 45 Ohio Fairfield County 146156 146194 146417 ... 7.398997 1.812045 -2.723491 6.477607 7.561272 3.269079 7.242397 10.153671 5.916284 8.151654
2103 50 2 3 39 49 Ohio Franklin County 1163414 1163476 1166202 ... -2.261042 4.535147 7.345857 8.393981 7.728997 7.809282 5.912705 9.237498 2.811147 0.478576
2123 50 2 3 39 89 Ohio Licking County 166492 166482 166705 ... 5.026551 -0.197682 0.035847 2.701979 3.131373 5.258499 5.901858 7.372472 9.940364 5.349930
2127 50 2 3 39 97 Ohio Madison County 43435 43438 43434 ... 6.350987 -8.665712 -4.180942 6.887196 15.062589 3.222914 -17.116870 15.374408 7.463530 6.620287

5 rows × 164 columns

The CSV includes a lot of variables, but we are only interested in numerical population change.

In [10]:
print(", ".join(list(censusPep.columns)))
SUMLEV, REGION, DIVISION, STATE, COUNTY, STNAME, CTYNAME, CENSUS2010POP, ESTIMATESBASE2010, POPESTIMATE2010, POPESTIMATE2011, POPESTIMATE2012, POPESTIMATE2013, POPESTIMATE2014, POPESTIMATE2015, POPESTIMATE2016, POPESTIMATE2017, POPESTIMATE2018, POPESTIMATE2019, NPOPCHG_2010, NPOPCHG_2011, NPOPCHG_2012, NPOPCHG_2013, NPOPCHG_2014, NPOPCHG_2015, NPOPCHG_2016, NPOPCHG_2017, NPOPCHG_2018, NPOPCHG_2019, BIRTHS2010, BIRTHS2011, BIRTHS2012, BIRTHS2013, BIRTHS2014, BIRTHS2015, BIRTHS2016, BIRTHS2017, BIRTHS2018, BIRTHS2019, DEATHS2010, DEATHS2011, DEATHS2012, DEATHS2013, DEATHS2014, DEATHS2015, DEATHS2016, DEATHS2017, DEATHS2018, DEATHS2019, NATURALINC2010, NATURALINC2011, NATURALINC2012, NATURALINC2013, NATURALINC2014, NATURALINC2015, NATURALINC2016, NATURALINC2017, NATURALINC2018, NATURALINC2019, INTERNATIONALMIG2010, INTERNATIONALMIG2011, INTERNATIONALMIG2012, INTERNATIONALMIG2013, INTERNATIONALMIG2014, INTERNATIONALMIG2015, INTERNATIONALMIG2016, INTERNATIONALMIG2017, INTERNATIONALMIG2018, INTERNATIONALMIG2019, DOMESTICMIG2010, DOMESTICMIG2011, DOMESTICMIG2012, DOMESTICMIG2013, DOMESTICMIG2014, DOMESTICMIG2015, DOMESTICMIG2016, DOMESTICMIG2017, DOMESTICMIG2018, DOMESTICMIG2019, NETMIG2010, NETMIG2011, NETMIG2012, NETMIG2013, NETMIG2014, NETMIG2015, NETMIG2016, NETMIG2017, NETMIG2018, NETMIG2019, RESIDUAL2010, RESIDUAL2011, RESIDUAL2012, RESIDUAL2013, RESIDUAL2014, RESIDUAL2015, RESIDUAL2016, RESIDUAL2017, RESIDUAL2018, RESIDUAL2019, GQESTIMATESBASE2010, GQESTIMATES2010, GQESTIMATES2011, GQESTIMATES2012, GQESTIMATES2013, GQESTIMATES2014, GQESTIMATES2015, GQESTIMATES2016, GQESTIMATES2017, GQESTIMATES2018, GQESTIMATES2019, RBIRTH2011, RBIRTH2012, RBIRTH2013, RBIRTH2014, RBIRTH2015, RBIRTH2016, RBIRTH2017, RBIRTH2018, RBIRTH2019, RDEATH2011, RDEATH2012, RDEATH2013, RDEATH2014, RDEATH2015, RDEATH2016, RDEATH2017, RDEATH2018, RDEATH2019, RNATURALINC2011, RNATURALINC2012, RNATURALINC2013, RNATURALINC2014, RNATURALINC2015, RNATURALINC2016, RNATURALINC2017, RNATURALINC2018, RNATURALINC2019, RINTERNATIONALMIG2011, RINTERNATIONALMIG2012, RINTERNATIONALMIG2013, RINTERNATIONALMIG2014, RINTERNATIONALMIG2015, RINTERNATIONALMIG2016, RINTERNATIONALMIG2017, RINTERNATIONALMIG2018, RINTERNATIONALMIG2019, RDOMESTICMIG2011, RDOMESTICMIG2012, RDOMESTICMIG2013, RDOMESTICMIG2014, RDOMESTICMIG2015, RDOMESTICMIG2016, RDOMESTICMIG2017, RDOMESTICMIG2018, RDOMESTICMIG2019, RNETMIG2011, RNETMIG2012, RNETMIG2013, RNETMIG2014, RNETMIG2015, RNETMIG2016, RNETMIG2017, RNETMIG2018, RNETMIG2019

Create a new dataframe that includes only the county name (CTYNAME) and the fields whose names include "NPOPCHG".

In [11]:
censusPepPopChg = pd.concat([
    censusPep.filter(like="CTYNAME", axis="columns"), 
    censusPep.filter(like="NPOPCHG", axis="columns")
], axis="columns")
censusPepPopChg.head()
Out[11]:
CTYNAME NPOPCHG_2010 NPOPCHG_2011 NPOPCHG_2012 NPOPCHG_2013 NPOPCHG_2014 NPOPCHG_2015 NPOPCHG_2016 NPOPCHG_2017 NPOPCHG_2018 NPOPCHG_2019
2099 Delaware County 927 3436 2592 4253 4060 3951 3753 3726 4221 4086
2101 Fairfield County 223 757 127 1495 1564 894 1535 1897 1296 1592
2103 Franklin County 2726 14598 18245 19833 19484 19024 17064 21060 12188 9058
2123 Licking County 223 459 425 872 949 1201 1382 1624 2049 1196
2127 Madison County -4 -320 -123 265 724 159 -762 664 348 342

Eliminate the " County" suffix from the county names.

In [12]:
censusPepPopChg["CTYNAME"] = censusPepPopChg["CTYNAME"].str.replace(" County", "")

Aside from CTYNAME, all of our columns consist of two parts separated by an underscore: the prefix "NPOPCHG" and a suffix representing the four-digit year. Rename the columns, retaining only the year. Index by county name.

In [13]:
censusPepPopChg = censusPepPopChg \
    .set_index("CTYNAME") \
    .rename(columns=(lambda x:x[-4:]))
censusPepPopChg.head()
Out[13]:
2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
CTYNAME
Delaware 927 3436 2592 4253 4060 3951 3753 3726 4221 4086
Fairfield 223 757 127 1495 1564 894 1535 1897 1296 1592
Franklin 2726 14598 18245 19833 19484 19024 17064 21060 12188 9058
Licking 223 459 425 872 949 1201 1382 1624 2049 1196
Madison -4 -320 -123 265 724 159 -762 664 348 342

Jupyter can create charts.¶

Plot the 2019 population change by county as a horizontal bar chart and Franklin County's population change by year as a line chart.

In [14]:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
censusPepPopChg["2019"].plot.barh(ax=axes[0], title="2019 population change by county", xlabel="")
censusPepPopChg.loc["Franklin"].plot(ax=axes[1], title="Franklin County population change by year", style="o-")
Out[14]:
<AxesSubplot:title={'center':'Franklin County population change by year'}>

Maps too!¶

Read the county geography Shapefile directly from the Census website. Note that GeoPandas can read the zipped Shapefile directly; it is not necessary to download it and unzip it first. Save an archival copy of the Shapefile.

In [15]:
censusCountyPolysRaw = gpd.read_file(CENSUS_COUNTY_POLYGONS_URL)
censusCountyPolysRaw.to_file(CENSUS_COUNTY_POLYGONS_ARCHIVE_PATH, driver="ESRI Shapefile")
# If you need to load the archival copy, use the following line instead
# censusCountyPolysRaw = gpd.read_file(CENSUS_COUNTY_POLYGONS_ARCHIVE_PATH)
censusCountyPolysRaw.head()
Out[15]:
STATEFP COUNTYFP COUNTYNS GEOID NAME NAMELSAD LSAD CLASSFP MTFCC CSAFP CBSAFP METDIVFP FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry
0 31 039 00835841 31039 Cuming Cuming County 06 H1 G4020 None None None A 1477652222 10690952 +41.9158651 -096.7885168 POLYGON ((-97.01952 42.00410, -97.01952 42.004...
1 53 069 01513275 53069 Wahkiakum Wahkiakum County 06 H1 G4020 None None None A 680962890 61582307 +46.2946377 -123.4244583 POLYGON ((-123.43639 46.23820, -123.44759 46.2...
2 35 011 00933054 35011 De Baca De Baca County 06 H1 G4020 None None None A 6016819475 29089486 +34.3592729 -104.3686961 POLYGON ((-104.56739 33.99757, -104.56772 33.9...
3 31 109 00835876 31109 Lancaster Lancaster County 06 H1 G4020 339 30700 None A 2169270569 22849484 +40.7835474 -096.6886584 POLYGON ((-96.91075 40.78494, -96.91075 40.790...
4 31 129 00835886 31129 Nuckolls Nuckolls County 06 H1 G4020 None None None A 1489645188 1718484 +40.1764918 -098.0468422 POLYGON ((-98.27367 40.08940, -98.27367 40.089...

This Shapefile includes all counties nationwide. Filter by state using the state FIPS code (the state name is not available). Then, as before, filter by county name after appending the " County" suffix. After filtering, eliminate the suffix. Index by county name.

In [16]:
censusCountyPolys = censusCountyPolysRaw.loc[ \
    (censusCountyPolysRaw["STATEFP"] == STATE_FIPS) & \
    (censusCountyPolysRaw["NAMELSAD"].isin(["{} County".format(x) for x in COUNTY_NAMES]))
].copy()
censusCountyPolys["NAMELSAD"] = censusCountyPolys["NAMELSAD"].str.replace(" County", "")
censusCountyPolys = censusCountyPolys.set_index(["NAMELSAD"])
censusCountyPolys.head()
Out[16]:
STATEFP COUNTYFP COUNTYNS GEOID NAME LSAD CLASSFP MTFCC CSAFP CBSAFP METDIVFP FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry
NAMELSAD
Pickaway 39 129 01074077 39129 Pickaway 06 H1 G4020 198 18140 None A 1298175198 13790233 +39.6489470 -083.0528267 POLYGON ((-83.01307 39.80439, -83.01285 39.804...
Delaware 39 041 01074033 39041 Delaware 06 H1 G4020 198 18140 None A 1147778026 36698376 +40.2789411 -083.0074622 POLYGON ((-83.19229 40.24440, -83.19672 40.244...
Madison 39 097 01074061 39097 Madison 06 H1 G4020 198 18140 None A 1206445849 2119319 +39.8966074 -083.4008847 POLYGON ((-83.54053 39.91715, -83.54039 39.917...
Franklin 39 049 01074037 39049 Franklin 06 H1 G4020 198 18140 None A 1378938272 29041546 +39.9698749 -083.0090858 POLYGON ((-83.01188 40.13656, -83.01171 40.136...
Union 39 159 01074091 39159 Union 06 H1 G4020 198 18140 None A 1118161949 13254689 +40.2959008 -083.3670416 POLYGON ((-83.52822 40.43664, -83.52776 40.440...

Join our population change data to the polygons, aligning on the index (i.e. the county name).

In [17]:
censusPepPolys = gpd.GeoDataFrame(data=censusPepPopChg.copy(), geometry=censusCountyPolys["geometry"])
censusPepPolys.head()
Out[17]:
2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 geometry
CTYNAME
Delaware 927 3436 2592 4253 4060 3951 3753 3726 4221 4086 POLYGON ((-83.19229 40.24440, -83.19672 40.244...
Fairfield 223 757 127 1495 1564 894 1535 1897 1296 1592 POLYGON ((-82.80248 39.82295, -82.80242 39.823...
Franklin 2726 14598 18245 19833 19484 19024 17064 21060 12188 9058 POLYGON ((-83.01188 40.13656, -83.01171 40.136...
Licking 223 459 425 872 949 1201 1382 1624 2049 1196 POLYGON ((-82.76183 40.12586, -82.76181 40.125...
Madison -4 -320 -123 265 724 159 -762 664 348 342 POLYGON ((-83.54053 39.91715, -83.54039 39.917...

Make a nice interactive map. Yes, this is all it takes!

In [18]:
censusPepPolys.explore(column="2019")
Out[18]:
[Interactive map showing 2019 population change by county]

And statistical analyses? You bet.¶

Support for statistical analysis in pandas is limited, so we'll use SciPy's stats module instead. Convert our time index (the column names) and time series values (Franklin County population change) to individual variables for convenience. Note that the column names are strings, so it is necessary to convert them to integers.

In [19]:
x = censusPepPopChg.columns.to_series().astype("int")
y = censusPepPopChg.loc["Franklin"]

Perform linear regression and calculate the slope and standard error.

In [20]:
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
trendline = slope * x + intercept
print("Slope: {:1.2f}".format(slope))
print("Intercept: {:1.2f}".format(intercept))
print("Standard error: {:1.2f}".format(std_err))
print("Standard error as percent of mean: {:1.2f}%".format(std_err/y.mean()*100))
Slope: 275.31
Intercept: -539282.16
Standard error: 671.80
Standard error as percent of mean: 4.38%

Calculate the t-statistic and p-value for a one-tailed test (significance level = 0.05) and print the results.

In [21]:
def print_trend_assessments(p_value, alpha, slope):
    if p_value < alpha and slope < 0:
        print("There is significant evidence (p = {:.4f}) of a negative trend".format(p_value))
    else:
        print("There is no significant evidence of a negative trend")
    if p_value < alpha and slope > 0:
        print("There is significant evidence (p = {:.4f}) of a positive trend".format(p_value))
    else:
        print("There is no significant evidence of a positive trend")
In [22]:
t_stat = slope / (std_err / np.sqrt(len(x)))
p_value = stats.t.sf(np.abs(t_stat), len(x)-2)

alpha = 0.05
print_trend_assessments(p_value, alpha, slope)

plt.plot(x,y,"o")
plt.plot(x,trendline)
plt.show()
There is no significant evidence of a negative trend
There is no significant evidence of a positive trend

Need some nicely-formatted output for the boss? Jupyter has you covered.¶

  • HTML
  • PDF
  • Microsoft Word
  • Presentations
  • Interactive dashboards
  • Websites
  • Books
Dashboard example screenshot

Note: Some of these export options are built into Jupyter Notebook and JupyterLab. For others, you need a more sophisticated export tool like Quarto.
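For example, HTML export is available from the File menu, from the command line (jupyter nbconvert --to html notebook.ipynb), or programmatically. A minimal sketch using nbconvert's Python API (the filenames are illustrative):

# Convert a notebook to a standalone HTML report using nbconvert
from nbconvert import HTMLExporter

body, resources = HTMLExporter().from_filename("notebook.ipynb")
with open("notebook.html", "w", encoding="utf-8") as f:
    f.write(body)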

Closing thoughts¶

Your environment should be reproducible too¶

  • Python virtual environments (for Python-only work)
  • Docker (for more complex workflows)
  • Documentation (if you like doing extra work)

venv
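For example, a virtual environment plus a pinned requirements file turns your environment into an artifact you can save and recreate. A minimal sketch using IPython's "!" shell escape (paths are illustrative and differ on Windows):

# Create an isolated environment and record exact package versions
!python -m venv .venv
!.venv/bin/pip install pandas geopandas scipy matplotlib
!.venv/bin/pip freeze > requirements.txt
# Later, recreate the environment on any machine with:
# .venv/bin/pip install -r requirements.txt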

Use version control¶

  • Lots of choices. Git (often via GitHub) is most popular.
  • Control versions of your data too!
  • Text formats allow you to see diffs

GitHub Desktop
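A minimal sketch of the basic workflow, again via IPython's "!" shell escape (filenames are illustrative):

# Put the notebook and its pinned environment under version control
!git init
!git add notebook.ipynb requirements.txt
!git commit -m "Initial version of the population change analysis"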

You can do all of this in R too¶

RMarkdown

No Jupyter? No problem. Binder lets you run Jupyter in the cloud.¶

Binder

Jupyter is Free Software (so are Python and R)¶

MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

10 commandments of reproducible data science¶

  1. Thou shalt provide a succinct and simple introduction
  2. Thou shalt capture every step and assumption
  3. Thou shalt capture metadata for inputs and outputs
  4. Thou shalt explain complex operations in words thy peers can understand
  5. Thou shalt collect prerequisite steps in one place
  6. Thou shalt preserve versions of thy process
  7. Thou shalt get thine input data directly from the source
  8. Thou shalt save a copy of thine input data
  9. Thou shalt document thine analysis environment
  10. Thou shalt use freely available tools that thy peers have access to

Questions?

Accessing the content from this presentation¶

All of the content presented today is publicly available on GitHub:

https://github.com/aporr/jupyter-reproducible-workflows

The slides are available directly from the following URL:

https://aporr.github.io/jupyter-reproducible-workflows/slides.html

The slides are implemented using Reveal.js, which arranges slides in a 2D layout. Press PGDN to move to the next slide, PGUP to move to the previous slide, or ESC to see an overview and move through the slides non-linearly.