In [1]:
# imports
import pandas as pd
import numpy as np

from great_schools import get_nearby_schools
from distance import get_distance
from secret import get_key
from district_score import get_overall_rating

## Shaun and Daniela's Boston Public School Analysis
#### 2021.04.10

Fetch the API key from the local filesystem.

In [2]:
# Relative path of the file that holds the API key (kept outside the notebook
# so the key itself never appears in the source).
api_key_file = '../keys/api.key'
# Load the key through the project-local `get_key` helper; `api_key` is what
# the `get_nearby_schools` calls further down pass to the API.
api_key = get_key(api_key_file)

Use the `nearby_schools` API endpoint to grab the raw data for all schools within the maximum radius of each city.

In [3]:
# Some columns will be dropped immediately as pre-processing.
drops = [
    'nces-id',
    'school-summary',
    'street',
    'fipscounty',
    'phone',
    'fax',
    'web-site',
    'overview-url',
    'rating-description',
    'distance',
]


def load_nearby_schools(api_key, cache_file, lat, lon, radius="50",
                        drop_columns=(), refresh=False):
    """Return the nearby-schools table for one location, indexed by ``universal-id``.

    Parameters
    ----------
    api_key : value returned by ``get_key``; forwarded to ``get_nearby_schools``.
    cache_file : path of the CSV cache for this location.
    lat, lon, radius : forwarded unchanged to ``get_nearby_schools``.
    drop_columns : columns removed from the raw API result before caching.
    refresh : when True, query the API and overwrite ``cache_file`` first.

    Returns
    -------
    pandas.DataFrame read back from ``cache_file``.

    The frame is always loaded from the CSV, even right after a refresh, so a
    refreshed run and a cached run produce the same index and dtypes.
    Previously the refreshed frame kept a default integer index while the
    cached one was indexed by ``universal-id``.
    """
    if refresh:
        schools = get_nearby_schools(api_key, lat, lon, radius)
        # Written with the default integer index, which is the layout of the
        # existing cache files (it comes back as the "Unnamed: 0" column).
        (
            pd.DataFrame.from_dict(schools)
            .drop(columns=list(drop_columns))
            .to_csv(cache_file)
        )

    return (
        pd.read_csv(cache_file)
        .set_index(keys=["universal-id"], drop=True)
        .drop(columns=["Unnamed: 0"])
    )


# Set to True to re-query the API and overwrite the cached CSV files.
refresh = False

# Grab data for Boston.
boston_nearby_schools_file = '../data/nearby_schools/boston.csv'
boston_df = load_nearby_schools(
    api_key, boston_nearby_schools_file, "42.3", "-71.2", "50",
    drop_columns=drops, refresh=refresh,
)

# Grab data for Buffalo.
buffalo_nearby_schools_file = '../data/nearby_schools/buffalo.csv'
buffalo_df = load_nearby_schools(
    api_key, buffalo_nearby_schools_file, "42.9625", "-78.7425", "50",
    drop_columns=drops, refresh=refresh,
)

Process the `lat` and `lon` columns from the API output into tuples.

Then create two new columns:
- Distance to Downtown
- Distance to Work

In [4]:
# Coordinates of the two points of interest.
downtown = (42.3674836866797, -71.07134540735377)  # Science Museum
work = (42.47381059540949, -71.25414135292398)  # Hartwell

# Pair each school's `lat` and `lon` values into a single tuple.
boston_df['coordinates'] = list(zip(boston_df['lat'], boston_df['lon']))

# For every school, tabulate the distance from its coordinates to each point
# of interest, as computed by the project-local `get_distance` helper.
boston_df['distance-to-downtown'] = boston_df['coordinates'].apply(get_distance, p2=downtown)
boston_df['distance-to-work'] = boston_df['coordinates'].apply(get_distance, p2=work)

We should definitely remove all schools that aren't in Massachusetts.

In [5]:
# Report the table size before filtering.
print(f'There are {len(boston_df)} schools from the original API results.')

# Keep only the rows whose `state` is Massachusetts, then report the new size.
in_massachusetts = boston_df['state'].eq("MA")
boston_df = boston_df.loc[in_massachusetts]
print(f'Allowing only schools from Massachusetts reduces the dataset to {len(boston_df)} schools.')

There are 1789 schools from the original API results.
Allowing only schools from Massachusetts reduces the dataset to 1375 schools.


How many unique district IDs are there?

In [6]:
# Collect the distinct `district-id` values among the remaining schools.
districts = pd.unique(boston_df["district-id"])
print(f'\nThere are {len(districts)} unique school districts.\n')


There are 230 unique school districts.



Which of these districts are close to both work and downtown Boston?

In [7]:
# Upper bounds for "reasonable proximity", in the same units as the
# `distance-to-*` columns produced by `get_distance`.
max_distance_to_downtown = 35.0
max_distance_to_work = 20.0

# Approximate each district's distance to a point of interest by the mean
# distance of its schools (not a true geographic centre). A single groupby
# yields both columns with explicit names, instead of merging two frames and
# relying on the auto-generated '0_x' / '0_y' suffixes.
# Note: `mean()` ignores missing distances rather than propagating NaN.
both_df = (
    boston_df
    .groupby('district-id')[['distance-to-downtown', 'distance-to-work']]
    .mean()
    .rename(columns={'distance-to-downtown': "downtown", 'distance-to-work': "work"})
)

# Keep only districts that are close enough to both places.
both_df = both_df[
    (both_df["downtown"] < max_distance_to_downtown)
    & (both_df["work"] < max_distance_to_work)
]

print(f'\nThere are {len(both_df)} school districts within reasonable proximity to downtown and work.\n')

# filter out all schools which aren't in proximal districts
proximal_district_ids = list(both_df.index)
boston_df = boston_df[boston_df['district-id'].isin(proximal_district_ids)]

print(f'There are {len(boston_df)} schools within these proximal districts.\n')


There are 90 school districts within reasonable proximity to downtown and work.

There are 699 schools within these proximal districts.



Let's drop any districts whose average rating is not above the mean rating of the remaining schools.

In [8]:
# Mean and standard deviation of `rating` over every school still in the table.
mean_rating = boston_df['rating'].mean()
std_rating = boston_df['rating'].std()

print(f'\nOf the remaining {len(boston_df)} schools, the average rating is {mean_rating}.')

# Average rating per district, keyed by `district-id`.
ave_ratings = {
    district_id: ratings.mean()
    for district_id, ratings in boston_df.groupby(by='district-id')['rating']
}

# Retain only the districts whose average is strictly above the overall mean.
not_low_performing = [
    district_id
    for district_id, average in ave_ratings.items()
    if average > mean_rating
]
boston_df = boston_df.loc[boston_df['district-id'].isin(not_low_performing)]

# Alphabetical list of the surviving district names, for the report below.
districts = sorted(boston_df['district-name'].unique())
print(f'\nThere are {len(districts)} districts remaining after pruning districts whose collective average is below the population mean rating.\n')
print(f'Which are, {districts}')


Of the remaining 699 schools, the average rating is 5.664546899841017.

There are 56 districts remaining after pruning districts whose collective average is below the population mean rating.

Which are, ['Acton-Boxborough School District', 'Andover School District', 'Arlington Public Schools', 'Ashland School District', 'Assabet Valley Regional Vocational Technical School District', 'Bedford School District', 'Belmont School District', 'Billerica School District', 'Boxford School District', 'Brookline School District', 'Burlington School District', 'Cambridge School District', 'Carlisle School District', 'Chelmsford School District', 'Concord School District', 'Concord-Carlisle School District', 'Dover School District', 'Dover-Sherborn School District', 'Dracut School District', 'Essex North Shore Agricultural and Technical School District', 'Groton-Dunstable School District', 'Harvard School District', 'Lexington School District', 'Lincoln-Sudbury School District', 'Littleton School 

In [9]:
# Re-index the table by (district-name, name) and sort it on that index so
# schools are grouped by district in the display.
boston_df = boston_df.set_index(['district-name', 'name']).sort_index()
boston_df

Unnamed: 0_level_0,Unnamed: 1_level_0,state-id,type,level-codes,level,city,state,zip,county,lat,lon,district-id,rating,year,coordinates,distance-to-downtown,distance-to-work
district-name,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Acton-Boxborough School District,Acton-Boxborough Regional High School,6000505,public,h,"9,10,11,12,UG",Acton,MA,1720,Middlesex County,42.479694,-71.458084,59,9.0,2021.0,"(42.479694, -71.458084)",21.179084,10.393596
Acton-Boxborough School District,Blanchard Memorial School,6000005,public,e,"KG,1,2,3,4,5,6",Boxborough,MA,1719,Middlesex County,42.482014,-71.505814,59,8.0,2021.0,"(42.482014, -71.505814)",23.513033,12.828547
Acton-Boxborough School District,C.T. Douglas Elementary School,6000020,public,e,"KG,1,2,3,4,5,6",Acton,MA,1720,Middlesex County,42.481873,-71.471588,59,6.0,2021.0,"(42.481873, -71.471588)",21.874635,11.087128
Acton-Boxborough School District,Luther Conant School,6000030,public,e,"KG,1,2,3,4,5,6",Acton,MA,1720,Middlesex County,42.475239,-71.436340,59,8.0,2021.0,"(42.475239, -71.43634)",20.036107,9.279210
Acton-Boxborough School District,Mccarthy-Towne School,6000015,public,e,"KG,1,2,3,4,5,6",Acton,MA,1720,Middlesex County,42.476936,-71.453590,59,5.0,2021.0,"(42.476936, -71.45359)",20.896860,10.159317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Winchester School District,Lynch Elementary School,3440020,public,"p,e","PK,KG,1,2,3,4,5",Winchester,MA,1890,Middlesex County,42.460964,-71.150330,464,7.0,2021.0,"(42.460964, -71.15033)",7.607210,5.361206
Winchester School District,Mccall Middle School,3440305,public,m,678,Winchester,MA,1890,Middlesex County,42.449566,-71.134735,464,7.0,2021.0,"(42.449566, -71.134735)",6.523996,6.308317
Winchester School District,Muraco Elementary School,3440040,public,e,"KG,1,2,3,4,5",Winchester,MA,1890,Middlesex County,42.463272,-71.131409,464,9.0,2021.0,"(42.463272, -71.131409)",7.288029,6.293099
Winchester School District,Vinson-Owen Elementary School,3440025,public,"p,e","PK,KG,1,2,3,4,5",Winchester,MA,1890,Middlesex County,42.449741,-71.175018,464,9.0,2021.0,"(42.449741, -71.175018)",7.758212,4.359471


In [10]:
# List the distinct `level-codes` values that occur among the remaining schools.
print(pd.unique(boston_df['level-codes']))

['h' 'e' 'm' 'e,m' 'p,h' 'p,e' 'p,e,m' 'p,e,h' 'm,h']


In [11]:
# Show every school of one district; the result is indexed by school `name`.
boston_df.xs("Boxford School District", level="district-name")

Unnamed: 0_level_0,state-id,type,level-codes,level,city,state,zip,county,lat,lon,district-id,rating,year,coordinates,distance-to-downtown,distance-to-work
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Harry Lee Cole,380005,public,"p,e","PK,KG,1,2",Boxford,MA,1921,Essex County,42.660408,-71.00177,102,,,"(42.660408, -71.00177)",20.532659,18.184645
Spofford Pond,380013,public,e,3456,Boxford,MA,1921,Essex County,42.697018,-71.017365,102,7.0,2021.0,"(42.697018, -71.017365)",22.917933,19.554889


In [12]:
# Pass the filtered table (indexed by district-name and name at this point) to
# the project-local `get_overall_rating` helper.
# NOTE(review): the recorded output of this cell is
# "ValueError: If using all scalar values, you must pass an index".
# pandas raises that message when a DataFrame is built from a dict of scalar
# values without an index; presumably this happens inside `get_overall_rating`
# -- TODO confirm and fix there so the notebook survives Restart & Run All.
get_overall_rating(boston_df)

ValueError: If using all scalar values, you must pass an index