{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from great_schools import get_nearby_schools\n",
    "from distance import get_distance\n",
    "from secret import get_key\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shaun and Daniela's Boston Public School Analysis\n",
    "#### 2021.04.10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fetch the API key from the local filesystem."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The key is read from a local file via get_key, so it is never written into the notebook.\n",
    "api_key_file = '../keys/api.key'\n",
    "api_key = get_key(api_key_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the `nearby_schools` API endpoint to grab raw data of all schools within the maximum radius"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns dropped immediately as pre-processing.\n",
    "drops = [\n",
    "    'nces-id',\n",
    "    'school-summary',\n",
    "    'street',\n",
    "    'fipscounty',\n",
    "    'phone',\n",
    "    'fax',\n",
    "    'web-site',\n",
    "    'overview-url',\n",
    "    'rating-description',\n",
    "    'distance',\n",
    "]\n",
    "\n",
    "\n",
    "def load_nearby_schools(cache_file, lat, lon, radius=\"50\", refresh=False):\n",
    "    \"\"\"Return the schools near (lat, lon), indexed by `universal-id`.\n",
    "\n",
    "    cache_file: path of the CSV file that caches the API response.\n",
    "    lat, lon, radius: passed through unchanged to `get_nearby_schools`.\n",
    "    refresh: when True, query the API and overwrite `cache_file` first.\n",
    "\n",
    "    The frame is always read back from `cache_file`, so a refreshed run and a\n",
    "    cached run yield the same index, columns and dtypes.\n",
    "    \"\"\"\n",
    "    if refresh:\n",
    "        schools = get_nearby_schools(api_key, lat, lon, radius)\n",
    "        pd.DataFrame.from_dict(schools).drop(columns=drops).to_csv(cache_file)\n",
    "    return (\n",
    "        pd.read_csv(cache_file)\n",
    "        .set_index(\"universal-id\")\n",
    "        # to_csv also stored the positional index; read_csv names that column \"Unnamed: 0\"\n",
    "        .drop(columns=[\"Unnamed: 0\"])\n",
    "    )\n",
    "\n",
    "\n",
    "# Set to True to re-query the API instead of using the cached CSV files.\n",
    "refresh = False\n",
    "\n",
    "# Grab data for Boston.\n",
    "boston_nearby_schools_file = '../data/nearby_schools/boston.csv'\n",
    "boston_df = load_nearby_schools(boston_nearby_schools_file, \"42.3\", \"-71.2\", refresh=refresh)\n",
    "\n",
    "# Grab data for Buffalo.\n",
    "buffalo_nearby_schools_file = '../data/nearby_schools/buffalo.csv'\n",
    "buffalo_df = load_nearby_schools(buffalo_nearby_schools_file, \"42.9625\", \"-78.7425\", refresh=refresh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Process the `lat` and `lon` columns from the API output into tuples.\n",
    "\n",
    "Then create two new columns:\n",
    "- Distance to Downtown\n",
    "- Distance to Work"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Form a (lat, lon) tuple per school to represent its coordinates\n",
    "boston_df['coordinates'] = list(zip(boston_df.lat, boston_df.lon))\n",
    "\n",
    "# Define coordinates of important places\n",
    "downtown = (42.3674836866797, -71.07134540735377)  # Science Museum\n",
    "work = (42.47381059540949, -71.25414135292398)  # Hartwell\n",
    "\n",
    "# Create new columns to tabulate distance to these important places;\n",
    "# get_distance receives each school's tuple plus the fixed point as p2\n",
    "boston_df['distance-to-downtown'] = boston_df['coordinates'].apply(func=get_distance, p2=downtown)\n",
    "boston_df['distance-to-work'] = boston_df['coordinates'].apply(func=get_distance, p2=work)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We should definitely remove all schools that aren't in Massachusetts."
] },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 1789 schools from the original API results.\n",
      "Allowing only schools from Massachusetts reduces the dataset to 1375 schools.\n"
     ]
    }
   ],
   "source": [
    "print(f'There are {len(boston_df)} schools from the original API results.')\n",
    "\n",
    "# Restrict the dataset to schools whose state is MA\n",
    "in_massachusetts = boston_df['state'].eq(\"MA\")\n",
    "boston_df = boston_df.loc[in_massachusetts]\n",
    "print(f'Allowing only schools from Massachusetts reduces the dataset to {len(boston_df)} schools.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many unique district IDs are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "There are 230 unique school districts.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Collect the distinct values of the district-id column\n",
    "districts = pd.unique(boston_df[\"district-id\"])\n",
    "print(f'\\nThere are {len(districts)} unique school districts.\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Which of these districts are close to both work and downtown Boston?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 116 school districts within reasonable proximity to downtown and work.\n",
      "\n",
      "There are 820 schools within these proximal districts.\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state-idnametypelevel-codeslevelcitystatezipcountylatlondistrict-namedistrict-idratingyearcoordinatesdistance-to-downtowndistance-to-work
universal-id
2500363380013Spofford Pondpublice3,4,5,6BoxfordMA1921Essex County42.697018-71.017365Boxford School District1027.02021.0(42.697018, -71.017365)22.91793319.554889
2506356100305Gibbs Schoolpublice,m6ArlingtonMA2474Middlesex County42.410576-71.145081Arlington Public Schools697.02021.0(42.410576, -71.145081)4.7949587.066929
25018353470410Daniel L Joyce Middle Schoolpublicm6,7,8WoburnMA1801Middlesex County42.477467-71.175484Woburn School District4674.02021.0(42.477467, -71.175484)9.2649224.013598
25017143150005Claypit Hill SchoolpubliceKG,1,2,3,4,5WaylandMA1778Middlesex County42.373108-71.344765Wayland School District4348.02021.0(42.373108, -71.344765)13.9527918.347379
25026311810055Tenney Grammar Schoolpublicp,e,mPK,KG,1,2,3,4,5,6,7,8MethuenMA1844Essex County42.732357-71.177345Methuen School District2703.02021.0(42.732357, -71.177345)25.76324318.273064
2500515710505Danvers High Schoolpublich9,10,11,12,UGDanversMA1923Essex County42.582523-70.931618Danvers School District1416.02021.0(42.582523, -70.931618)16.46450318.045917
25014982740410Next Wave Junior High Schoolpublicm7,8SomervilleMA2145Middlesex County42.387581-71.087326Somerville School District383NaNNaN(42.387581, -71.087326)1.60930810.378716
25013842430310Broad Meadows Middle Schoolpublicm6,7,8QuincyMA2169Norfolk County42.259659-70.985237Quincy School District3494.02021.0(42.259659, -70.985237)8.64600320.169491
25009161570006Hanscom Primary Schoolpublicp,ePK,KG,1,2,3Hanscom Air Force BsMA1731Middlesex County42.456898-71.278549Lincoln School District2423.02021.0(42.456898, -71.278549)12.2344631.705602
25017883360065Lawrence W PingreepubliceKG,1,2,3,4WeymouthMA2189Norfolk County42.217670-70.925240Weymouth School District4558.02021.0(42.21767, -70.92524)12.75463924.381842
\n", "
" ], "text/plain": [ " state-id name type level-codes \\\n", "universal-id \n", "2500363 380013 Spofford Pond public e \n", "2506356 100305 Gibbs School public e,m \n", "2501835 3470410 Daniel L Joyce Middle School public m \n", "2501714 3150005 Claypit Hill School public e \n", "2502631 1810055 Tenney Grammar School public p,e,m \n", "2500515 710505 Danvers High School public h \n", "2501498 2740410 Next Wave Junior High School public m \n", "2501384 2430310 Broad Meadows Middle School public m \n", "2500916 1570006 Hanscom Primary School public p,e \n", "2501788 3360065 Lawrence W Pingree public e \n", "\n", " level city state zip \\\n", "universal-id \n", "2500363 3,4,5,6 Boxford MA 1921 \n", "2506356 6 Arlington MA 2474 \n", "2501835 6,7,8 Woburn MA 1801 \n", "2501714 KG,1,2,3,4,5 Wayland MA 1778 \n", "2502631 PK,KG,1,2,3,4,5,6,7,8 Methuen MA 1844 \n", "2500515 9,10,11,12,UG Danvers MA 1923 \n", "2501498 7,8 Somerville MA 2145 \n", "2501384 6,7,8 Quincy MA 2169 \n", "2500916 PK,KG,1,2,3 Hanscom Air Force Bs MA 1731 \n", "2501788 KG,1,2,3,4 Weymouth MA 2189 \n", "\n", " county lat lon \\\n", "universal-id \n", "2500363 Essex County 42.697018 -71.017365 \n", "2506356 Middlesex County 42.410576 -71.145081 \n", "2501835 Middlesex County 42.477467 -71.175484 \n", "2501714 Middlesex County 42.373108 -71.344765 \n", "2502631 Essex County 42.732357 -71.177345 \n", "2500515 Essex County 42.582523 -70.931618 \n", "2501498 Middlesex County 42.387581 -71.087326 \n", "2501384 Norfolk County 42.259659 -70.985237 \n", "2500916 Middlesex County 42.456898 -71.278549 \n", "2501788 Norfolk County 42.217670 -70.925240 \n", "\n", " district-name district-id rating year \\\n", "universal-id \n", "2500363 Boxford School District 102 7.0 2021.0 \n", "2506356 Arlington Public Schools 69 7.0 2021.0 \n", "2501835 Woburn School District 467 4.0 2021.0 \n", "2501714 Wayland School District 434 8.0 2021.0 \n", "2502631 Methuen School District 270 3.0 2021.0 \n", "2500515 Danvers School 
District 141 6.0 2021.0 \n",
       "2501498 Somerville School District 383 NaN NaN \n",
       "2501384 Quincy School District 349 4.0 2021.0 \n",
       "2500916 Lincoln School District 242 3.0 2021.0 \n",
       "2501788 Weymouth School District 455 8.0 2021.0 \n",
       "\n",
       " coordinates distance-to-downtown distance-to-work \n",
       "universal-id \n",
       "2500363 (42.697018, -71.017365) 22.917933 19.554889 \n",
       "2506356 (42.410576, -71.145081) 4.794958 7.066929 \n",
       "2501835 (42.477467, -71.175484) 9.264922 4.013598 \n",
       "2501714 (42.373108, -71.344765) 13.952791 8.347379 \n",
       "2502631 (42.732357, -71.177345) 25.763243 18.273064 \n",
       "2500515 (42.582523, -70.931618) 16.464503 18.045917 \n",
       "2501498 (42.387581, -71.087326) 1.609308 10.378716 \n",
       "2501384 (42.259659, -70.985237) 8.646003 20.169491 \n",
       "2500916 (42.456898, -71.278549) 12.234463 1.705602 \n",
       "2501788 (42.21767, -70.92524) 12.754639 24.381842 "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upper bounds on a district's mean distance to each point of interest\n",
    "# (same units as the values returned by get_distance).\n",
    "MAX_DISTANCE_TO_DOWNTOWN = 35.0\n",
    "MAX_DISTANCE_TO_WORK = 25.0\n",
    "SAMPLE_SEED = 0  # fixed so the sampled preview is the same on every run\n",
    "\n",
    "# Mean distance from each district's schools to downtown and to work.\n",
    "# groupby().mean() skips missing distances, so one school without a distance\n",
    "# no longer turns its whole district into NaN (and silently filters it out).\n",
    "both_df = (\n",
    "    boston_df\n",
    "    .groupby('district-id')[['distance-to-downtown', 'distance-to-work']]\n",
    "    .mean()\n",
    "    .rename(columns={'distance-to-downtown': 'downtown', 'distance-to-work': 'work'})\n",
    ")\n",
    "\n",
    "# keep only the districts that are close enough to both places\n",
    "both_df = both_df[\n",
    "    (both_df['downtown'] < MAX_DISTANCE_TO_DOWNTOWN)\n",
    "    & (both_df['work'] < MAX_DISTANCE_TO_WORK)\n",
    "]\n",
    "\n",
    "print(f'There are {len(both_df)} school districts within reasonable proximity to downtown and work.\\n')\n",
    "\n",
    "# filter out all schools which aren't in proximal districts\n",
    "proximal_district_ids = list(both_df.index)\n",
    "boston_df = boston_df[boston_df['district-id'].isin(proximal_district_ids)]\n",
    "\n",
    "print(f'There are {len(boston_df)} schools within these proximal districts.\\n')\n",
    "\n",
    "# sample() raises when asked for more rows than exist, so cap the preview size\n",
    "boston_df.sample(min(10, len(boston_df)), random_state=SAMPLE_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some of these districts don't have enough rating data. Those should be dropped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rating coverage per district: number of schools and how many of them have a rating\n",
    "# ('size' counts every row, 'count' only the non-missing ratings).\n",
    "# TODO: decide how many rated schools count as \"enough\" and drop the districts below that.\n",
    "rating_coverage = (\n",
    "    boston_df\n",
    "    .groupby('district-id')['rating']\n",
    "    .agg(['size', 'count'])\n",
    "    .rename(columns={'size': 'schools', 'count': 'rated'})\n",
    ")\n",
    "rating_coverage.sort_values('rated').head(10)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "4fc861b332db140b7b363b167627eee6a3238262e7c99e0237067fec0875fee7"
  },
  "kernelspec": {
   "display_name": "Python 3.8.10 ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}