Archived
1
0
This repository has been archived on 2025-04-27. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
schools/main/analysis.ipynb

636 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from great_schools import get_nearby_schools\n",
"from distance import get_distance\n",
"from secret import get_key\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shaun and Daniela's Boston Public School Analysis\n",
"#### 2021.04.10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fetch the API key from the local filesystem."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# Load the API key from the local key file using the project's `get_key` helper.\n",
"api_key_file = '../keys/api.key'\n",
"api_key = get_key(api_key_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the `nearby_schools` API endpoint to grab raw data of all schools within the maximum radius."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Columns dropped immediately as pre-processing.\n",
"drops = [\n",
"    'nces-id',\n",
"    'school-summary',\n",
"    'street',\n",
"    'fipscounty',\n",
"    'phone',\n",
"    'fax',\n",
"    'web-site',\n",
"    'overview-url',\n",
"    'rating-description',\n",
"    'distance',\n",
"]\n",
"\n",
"# Set to True to re-query the API and overwrite the cached CSV files.\n",
"refresh = False\n",
"\n",
"\n",
"def load_nearby_schools(csv_path, lat, lon, radius='50', refresh=False):\n",
"    \"\"\"Load one area's nearby-schools table, indexed by `universal-id`.\n",
"\n",
"    Args:\n",
"        csv_path: CSV cache file to read (and, on refresh, overwrite).\n",
"        lat, lon, radius: Passed unchanged, in this order, to\n",
"            `get_nearby_schools` after the notebook-level `api_key`.\n",
"            NOTE(review): presumably centre latitude/longitude and search\n",
"            radius -- confirm against `great_schools`.\n",
"        refresh: When true, query the API, drop the `drops` columns and\n",
"            rewrite `csv_path` before loading.\n",
"\n",
"    Returns:\n",
"        DataFrame read back from `csv_path`, so the refreshed and cached\n",
"        paths yield the same index and columns.\n",
"    \"\"\"\n",
"    if refresh:\n",
"        schools = get_nearby_schools(api_key, lat, lon, radius)\n",
"        pd.DataFrame.from_dict(schools).drop(columns=drops).to_csv(csv_path)\n",
"\n",
"    # `to_csv` wrote the default integer index as an unnamed first column,\n",
"    # which `read_csv` surfaces as 'Unnamed: 0'; discard it.\n",
"    return (\n",
"        pd.read_csv(csv_path)\n",
"        .set_index('universal-id')\n",
"        .drop(columns=['Unnamed: 0'])\n",
"    )\n",
"\n",
"\n",
"# Grab data for Boston.\n",
"boston_nearby_schools_file = '../data/nearby_schools/boston.csv'\n",
"boston_df = load_nearby_schools(boston_nearby_schools_file, '42.3', '-71.2', refresh=refresh)\n",
"\n",
"# Grab data for Buffalo.\n",
"buffalo_nearby_schools_file = '../data/nearby_schools/buffalo.csv'\n",
"buffalo_df = load_nearby_schools(buffalo_nearby_schools_file, '42.9625', '-78.7425', refresh=refresh)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Process the `lat` and `lon` columns from the API output into tuples.\n",
"\n",
"Then create two new columns:\n",
"- Distance to Downtown\n",
"- Distance to Work"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Pair each school's `lat` and `lon` values into a single (lat, lon) tuple.\n",
"boston_df['coordinates'] = [\n",
"    (lat, lon) for lat, lon in zip(boston_df['lat'], boston_df['lon'])\n",
"]\n",
"\n",
"# Reference points the distances are measured to.\n",
"downtown = (42.3674836866797, -71.07134540735377)  # Science Museum\n",
"work = (42.47381059540949, -71.25414135292398)  # Hartwell\n",
"\n",
"# One distance column per reference point: `get_distance` receives each\n",
"# school's coordinate tuple, with the reference point passed as `p2`.\n",
"school_coordinates = boston_df['coordinates']\n",
"boston_df['distance-to-downtown'] = school_coordinates.apply(get_distance, p2=downtown)\n",
"boston_df['distance-to-work'] = school_coordinates.apply(get_distance, p2=work)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We should definitely remove all schools that aren't in Massachusetts."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 1789 schools from the original API results.\n",
"Allowing only schools from Massachusetts reduces the dataset to 1375 schools.\n"
]
}
],
"source": [
"total_schools = len(boston_df)\n",
"print(f'There are {total_schools} schools from the original API results.')\n",
"\n",
"# Keep only the rows whose `state` column is 'MA'.\n",
"in_massachusetts = boston_df['state'] == 'MA'\n",
"boston_df = boston_df[in_massachusetts]\n",
"print(f'Allowing only schools from Massachusetts reduces the dataset to {len(boston_df)} schools.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"How many unique district IDs are there?"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"There are 230 unique school districts.\n",
"\n"
]
}
],
"source": [
"# Distinct values of the `district-id` column among the remaining schools.\n",
"districts = boston_df['district-id'].unique()\n",
"district_count = len(districts)\n",
"print(f'\\nThere are {district_count} unique school districts.\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Which of these districts are close to both work and downtown Boston?"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 116 school districts within reasonable proximity to downtown and work.\n",
"\n",
"There are 820 schools within these proximal districts.\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state-id</th>\n",
" <th>name</th>\n",
" <th>type</th>\n",
" <th>level-codes</th>\n",
" <th>level</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip</th>\n",
" <th>county</th>\n",
" <th>lat</th>\n",
" <th>lon</th>\n",
" <th>district-name</th>\n",
" <th>district-id</th>\n",
" <th>rating</th>\n",
" <th>year</th>\n",
" <th>coordinates</th>\n",
" <th>distance-to-downtown</th>\n",
" <th>distance-to-work</th>\n",
" </tr>\n",
" <tr>\n",
" <th>universal-id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2500363</th>\n",
" <td>380013</td>\n",
" <td>Spofford Pond</td>\n",
" <td>public</td>\n",
" <td>e</td>\n",
" <td>3,4,5,6</td>\n",
" <td>Boxford</td>\n",
" <td>MA</td>\n",
" <td>1921</td>\n",
" <td>Essex County</td>\n",
" <td>42.697018</td>\n",
" <td>-71.017365</td>\n",
" <td>Boxford School District</td>\n",
" <td>102</td>\n",
" <td>7.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.697018, -71.017365)</td>\n",
" <td>22.917933</td>\n",
" <td>19.554889</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2506356</th>\n",
" <td>100305</td>\n",
" <td>Gibbs School</td>\n",
" <td>public</td>\n",
" <td>e,m</td>\n",
" <td>6</td>\n",
" <td>Arlington</td>\n",
" <td>MA</td>\n",
" <td>2474</td>\n",
" <td>Middlesex County</td>\n",
" <td>42.410576</td>\n",
" <td>-71.145081</td>\n",
" <td>Arlington Public Schools</td>\n",
" <td>69</td>\n",
" <td>7.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.410576, -71.145081)</td>\n",
" <td>4.794958</td>\n",
" <td>7.066929</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2501835</th>\n",
" <td>3470410</td>\n",
" <td>Daniel L Joyce Middle School</td>\n",
" <td>public</td>\n",
" <td>m</td>\n",
" <td>6,7,8</td>\n",
" <td>Woburn</td>\n",
" <td>MA</td>\n",
" <td>1801</td>\n",
" <td>Middlesex County</td>\n",
" <td>42.477467</td>\n",
" <td>-71.175484</td>\n",
" <td>Woburn School District</td>\n",
" <td>467</td>\n",
" <td>4.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.477467, -71.175484)</td>\n",
" <td>9.264922</td>\n",
" <td>4.013598</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2501714</th>\n",
" <td>3150005</td>\n",
" <td>Claypit Hill School</td>\n",
" <td>public</td>\n",
" <td>e</td>\n",
" <td>KG,1,2,3,4,5</td>\n",
" <td>Wayland</td>\n",
" <td>MA</td>\n",
" <td>1778</td>\n",
" <td>Middlesex County</td>\n",
" <td>42.373108</td>\n",
" <td>-71.344765</td>\n",
" <td>Wayland School District</td>\n",
" <td>434</td>\n",
" <td>8.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.373108, -71.344765)</td>\n",
" <td>13.952791</td>\n",
" <td>8.347379</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2502631</th>\n",
" <td>1810055</td>\n",
" <td>Tenney Grammar School</td>\n",
" <td>public</td>\n",
" <td>p,e,m</td>\n",
" <td>PK,KG,1,2,3,4,5,6,7,8</td>\n",
" <td>Methuen</td>\n",
" <td>MA</td>\n",
" <td>1844</td>\n",
" <td>Essex County</td>\n",
" <td>42.732357</td>\n",
" <td>-71.177345</td>\n",
" <td>Methuen School District</td>\n",
" <td>270</td>\n",
" <td>3.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.732357, -71.177345)</td>\n",
" <td>25.763243</td>\n",
" <td>18.273064</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2500515</th>\n",
" <td>710505</td>\n",
" <td>Danvers High School</td>\n",
" <td>public</td>\n",
" <td>h</td>\n",
" <td>9,10,11,12,UG</td>\n",
" <td>Danvers</td>\n",
" <td>MA</td>\n",
" <td>1923</td>\n",
" <td>Essex County</td>\n",
" <td>42.582523</td>\n",
" <td>-70.931618</td>\n",
" <td>Danvers School District</td>\n",
" <td>141</td>\n",
" <td>6.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.582523, -70.931618)</td>\n",
" <td>16.464503</td>\n",
" <td>18.045917</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2501498</th>\n",
" <td>2740410</td>\n",
" <td>Next Wave Junior High School</td>\n",
" <td>public</td>\n",
" <td>m</td>\n",
" <td>7,8</td>\n",
" <td>Somerville</td>\n",
" <td>MA</td>\n",
" <td>2145</td>\n",
" <td>Middlesex County</td>\n",
" <td>42.387581</td>\n",
" <td>-71.087326</td>\n",
" <td>Somerville School District</td>\n",
" <td>383</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>(42.387581, -71.087326)</td>\n",
" <td>1.609308</td>\n",
" <td>10.378716</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2501384</th>\n",
" <td>2430310</td>\n",
" <td>Broad Meadows Middle School</td>\n",
" <td>public</td>\n",
" <td>m</td>\n",
" <td>6,7,8</td>\n",
" <td>Quincy</td>\n",
" <td>MA</td>\n",
" <td>2169</td>\n",
" <td>Norfolk County</td>\n",
" <td>42.259659</td>\n",
" <td>-70.985237</td>\n",
" <td>Quincy School District</td>\n",
" <td>349</td>\n",
" <td>4.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.259659, -70.985237)</td>\n",
" <td>8.646003</td>\n",
" <td>20.169491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2500916</th>\n",
" <td>1570006</td>\n",
" <td>Hanscom Primary School</td>\n",
" <td>public</td>\n",
" <td>p,e</td>\n",
" <td>PK,KG,1,2,3</td>\n",
" <td>Hanscom Air Force Bs</td>\n",
" <td>MA</td>\n",
" <td>1731</td>\n",
" <td>Middlesex County</td>\n",
" <td>42.456898</td>\n",
" <td>-71.278549</td>\n",
" <td>Lincoln School District</td>\n",
" <td>242</td>\n",
" <td>3.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.456898, -71.278549)</td>\n",
" <td>12.234463</td>\n",
" <td>1.705602</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2501788</th>\n",
" <td>3360065</td>\n",
" <td>Lawrence W Pingree</td>\n",
" <td>public</td>\n",
" <td>e</td>\n",
" <td>KG,1,2,3,4</td>\n",
" <td>Weymouth</td>\n",
" <td>MA</td>\n",
" <td>2189</td>\n",
" <td>Norfolk County</td>\n",
" <td>42.217670</td>\n",
" <td>-70.925240</td>\n",
" <td>Weymouth School District</td>\n",
" <td>455</td>\n",
" <td>8.0</td>\n",
" <td>2021.0</td>\n",
" <td>(42.21767, -70.92524)</td>\n",
" <td>12.754639</td>\n",
" <td>24.381842</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state-id name type level-codes \\\n",
"universal-id \n",
"2500363 380013 Spofford Pond public e \n",
"2506356 100305 Gibbs School public e,m \n",
"2501835 3470410 Daniel L Joyce Middle School public m \n",
"2501714 3150005 Claypit Hill School public e \n",
"2502631 1810055 Tenney Grammar School public p,e,m \n",
"2500515 710505 Danvers High School public h \n",
"2501498 2740410 Next Wave Junior High School public m \n",
"2501384 2430310 Broad Meadows Middle School public m \n",
"2500916 1570006 Hanscom Primary School public p,e \n",
"2501788 3360065 Lawrence W Pingree public e \n",
"\n",
" level city state zip \\\n",
"universal-id \n",
"2500363 3,4,5,6 Boxford MA 1921 \n",
"2506356 6 Arlington MA 2474 \n",
"2501835 6,7,8 Woburn MA 1801 \n",
"2501714 KG,1,2,3,4,5 Wayland MA 1778 \n",
"2502631 PK,KG,1,2,3,4,5,6,7,8 Methuen MA 1844 \n",
"2500515 9,10,11,12,UG Danvers MA 1923 \n",
"2501498 7,8 Somerville MA 2145 \n",
"2501384 6,7,8 Quincy MA 2169 \n",
"2500916 PK,KG,1,2,3 Hanscom Air Force Bs MA 1731 \n",
"2501788 KG,1,2,3,4 Weymouth MA 2189 \n",
"\n",
" county lat lon \\\n",
"universal-id \n",
"2500363 Essex County 42.697018 -71.017365 \n",
"2506356 Middlesex County 42.410576 -71.145081 \n",
"2501835 Middlesex County 42.477467 -71.175484 \n",
"2501714 Middlesex County 42.373108 -71.344765 \n",
"2502631 Essex County 42.732357 -71.177345 \n",
"2500515 Essex County 42.582523 -70.931618 \n",
"2501498 Middlesex County 42.387581 -71.087326 \n",
"2501384 Norfolk County 42.259659 -70.985237 \n",
"2500916 Middlesex County 42.456898 -71.278549 \n",
"2501788 Norfolk County 42.217670 -70.925240 \n",
"\n",
" district-name district-id rating year \\\n",
"universal-id \n",
"2500363 Boxford School District 102 7.0 2021.0 \n",
"2506356 Arlington Public Schools 69 7.0 2021.0 \n",
"2501835 Woburn School District 467 4.0 2021.0 \n",
"2501714 Wayland School District 434 8.0 2021.0 \n",
"2502631 Methuen School District 270 3.0 2021.0 \n",
"2500515 Danvers School District 141 6.0 2021.0 \n",
"2501498 Somerville School District 383 NaN NaN \n",
"2501384 Quincy School District 349 4.0 2021.0 \n",
"2500916 Lincoln School District 242 3.0 2021.0 \n",
"2501788 Weymouth School District 455 8.0 2021.0 \n",
"\n",
" coordinates distance-to-downtown distance-to-work \n",
"universal-id \n",
"2500363 (42.697018, -71.017365) 22.917933 19.554889 \n",
"2506356 (42.410576, -71.145081) 4.794958 7.066929 \n",
"2501835 (42.477467, -71.175484) 9.264922 4.013598 \n",
"2501714 (42.373108, -71.344765) 13.952791 8.347379 \n",
"2502631 (42.732357, -71.177345) 25.763243 18.273064 \n",
"2500515 (42.582523, -70.931618) 16.464503 18.045917 \n",
"2501498 (42.387581, -71.087326) 1.609308 10.378716 \n",
"2501384 (42.259659, -70.985237) 8.646003 20.169491 \n",
"2500916 (42.456898, -71.278549) 12.234463 1.705602 \n",
"2501788 (42.21767, -70.92524) 12.754639 24.381842 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cut-offs applied to each district's mean school distance. They are in\n",
"# whatever unit `get_distance` returns (not shown in this notebook).\n",
"MAX_DISTANCE_TO_DOWNTOWN = 35.0\n",
"MAX_DISTANCE_TO_WORK = 25.0\n",
"\n",
"# Per-district mean of the school-level distances. This averages over the\n",
"# district's schools; it is not the distance from a geographic centre.\n",
"# `.mean()` skips missing distances rather than propagating them.\n",
"both_df = (\n",
"    boston_df\n",
"    .groupby('district-id')[['distance-to-downtown', 'distance-to-work']]\n",
"    .mean()\n",
"    .rename(columns={'distance-to-downtown': 'downtown', 'distance-to-work': 'work'})\n",
")\n",
"\n",
"# Keep only the districts that satisfy both cut-offs.\n",
"both_df = both_df[\n",
"    (both_df['downtown'] < MAX_DISTANCE_TO_DOWNTOWN)\n",
"    & (both_df['work'] < MAX_DISTANCE_TO_WORK)\n",
"]\n",
"\n",
"print(f'There are {len(both_df)} school districts within reasonable proximity to downtown and work.\\n')\n",
"\n",
"# filter out all schools which aren't in proximal districts\n",
"proximal_district_ids = list(both_df.index)\n",
"boston_df = boston_df[boston_df['district-id'].isin(proximal_district_ids)]\n",
"\n",
"print(f'There are {len(boston_df)} schools within these proximal districts.\\n')\n",
"\n",
"# Fixed seed so a Restart & Run All shows the same ten example rows.\n",
"boston_df.sample(10, random_state=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some of these districts don't have enough rating data. Those should be dropped."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f54f95addf0>"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per district: number of schools with a non-missing `rating` next to the\n",
"# total number of schools, to show which districts lack rating data.\n",
"# TODO: choose a minimum number of rated schools and drop districts below it.\n",
"rating_coverage = (\n",
"    boston_df\n",
"    .groupby('district-id')['rating']\n",
"    .agg(['count', 'size'])\n",
"    .rename(columns={'count': 'rated-schools', 'size': 'total-schools'})\n",
")\n",
"rating_coverage.sort_values('rated-schools').head(10)"
]
}
],
"metadata": {
"interpreter": {
"hash": "4fc861b332db140b7b363b167627eee6a3238262e7c99e0237067fec0875fee7"
},
"kernelspec": {
"display_name": "Python 3.8.10 ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}