A note for the professor and TAs: refer to your email for a notebook containing more information about the data set.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Read in the data set from the Excel file:
df = pd.read_excel('dataset_proj.xlsx')
The goal of this project is to determine what factors influence booking rates at Medmo Inc., a healthcare technology company. Medmo facilitates radiology appointments for its clients, both B2B and B2C. Every scan request that comes in through its online portal contains a multitude of information that could be helpful in drawing correlations and creating prediction models. I, Zoe Birnbaum, will analyze the company's data set of all appointment requests to discover correlations between variables, identify what influences the likelihood that a requested appointment is booked, and determine whether we can predict booking based on such variables.
This data includes all appointments requested through the company's portal from April 2018 through April 2024. Attached to each request is information regarding the geographic location of the patient, type of scan/appointment, B2B vs. B2C, stage of the request (scheduled, completed, closed, etc.), date and time of the request (and of the booking, if applicable), and more. I chose to work with this data set because of my experience working with the company and my personal interest in the healthcare technology field. The set provides a sufficient amount of data, information, and variables for me to pursue my goal for this project.
Research Questions
Questions I hope to answer with this data:
RQ1: What factors are correlated with successful bookings? Based on those factors, can we predict whether a request will be booked?
RQ2: Is the complexity of a request correlated with the time taken to book the appointment after the request is made?
RQ1 will look at all requests, and RQ2 will look at only requests that are scheduled and beyond (only those requests have additional data on booking, such as time taken). In the data-cleaning section, I will create a DataFrame for each RQ (df1 and df2, respectively).
Why this data set is correct for answering the above
The data set contains variables for all factors included in my research questions, and I have over 80k scan requests to work with, which should yield statistically meaningful results.
Hypotheses:
1. The following factors can predict a successful appointment request: the patient's state, the scan modality, the number of previous cancelations, and the complexity of the request.
2. The complexity of a request and the time taken to book the appointment (measured from the time the appointment was requested) are positively correlated.
Since each observation is identified by the request ID, I'll make TestRequestID the index:
df.set_index("TestRequestID", inplace=True)
I'll drop the Latitude, Longitude, and ZipCode columns, and just use the data set's State column for location purposes.
df = df.drop(columns=['Latitude', 'Longitude', 'ZipCode'])
The column HasReportFile is currently represented as a Boolean (1 and 0), but would be more readable with values "Yes" and "No" instead.
df['HasReportFile'] = df['HasReportFile'].map({
    0: "No",
    1: "Yes"
})
Since I'll be looking at the time between DateRequested and DateBooked, I'm going to create a new column in the DataFrame called BookingTime that represents this difference:
df["BookingTime"] = df["DateBooked"] - df["DateRequested"]
An important metric in analyzing this data is the complexity of a scan request. Additional tasks and comments on a scan request, such as needing to ask a physician for a script, increase the complexity and I hypothesize that this a) reduces the likelihood that a request will become scheduled, and b) increases BookingTime. Let's create an algorithm to calculate complexity of a scan request:
def calculate_request_complexity(dfRow):
    score = 0
    # The weight of complexity for each task/comment was provided to me by the company:
    score += (dfRow['ExternalTasks'] * 10)
    score += (dfRow['InternalTasks'] * 5)
    score += (dfRow['B2BComments'] * 2)
    score += (dfRow['InternalComments'] * 2)
    score += (dfRow['ICComments'] * 1)
    return score
I'll add a Complexity column to the DataFrame so I can use this measure in the future:
df['Complexity'] = df.apply(calculate_request_complexity, axis=1)
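As an aside, a per-row apply can be slow on 80k+ rows. A minimal vectorized sketch of the same computation (using the same weights as above) would be:

# Vectorized equivalent (sketch): computes the same weighted sum without a per-row apply.
weights = {"ExternalTasks": 10, "InternalTasks": 5,
           "B2BComments": 2, "InternalComments": 2, "ICComments": 1}
complexity_vectorized = sum(df[col] * w for col, w in weights.items())
print((complexity_vectorized == df['Complexity']).all())  # should print True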
Dropping StatusID, as it's redundant with HasReportFile, and I will stick with the latter in this data set. Also dropping the other booking-related date columns (FirstBooking, LastBooking, DateRequested, DateBooked), since the new BookingTime column is what I'll be using:
df = df.drop(columns=['StatusID', 'FirstBooking', 'LastBooking', 'DateRequested', 'DateBooked'])
Time to handle the NaNs/missing data. These are the columns that contain such values and how many:
print(df.isnull().any())
StageID               False
StageName             False
Modality              False
ChannelSource         False
AppointmentDate        True
State                  True
Cancelations          False
InternalTasks         False
ExternalTasks         False
PriorAuthStatus       False
TimeToOrderEntry       True
TimeToA1Completion     True
InternalComments      False
B2BComments           False
ICComments            False
HasReportFile         False
BookingTime            True
Complexity            False
dtype: bool
print("AppointmentDate: ", len(df[df['AppointmentDate'].isnull() == True]))
print("State: ", len(df[df['State'].isnull() == True]))
print("TimeToOrderEntry: ", len(df[df['TimeToOrderEntry'].isnull() == True]))
print("TimeToA1Completion: ", len(df[df['TimeToA1Completion'].isnull() == True]))
print("BookingTime: ", len(df[df['BookingTime'].isnull() == True]))
AppointmentDate:  35840
State:  595
TimeToOrderEntry:  27951
TimeToA1Completion:  70925
BookingTime:  35838
BookingTime and AppointmentDate only contain data for requests that are scheduled and beyond, which is why there are so many null values. We are not looking at either variable for RQ1, so I will not include those columns in df1, and I will double-check that this problem is solved when df2 is created.
TimeToA1Completion and TimeToOrderEntry contain too many NaNs, and I am not looking at either variable in either RQ, so I will drop those columns as well.
State, however, is very relevant to RQ1, so I will drop the rows that are missing that data. The number of observations dropped (595) is negligible relative to the total (80k+), unlike the other four NaN-heavy variables, which is why I am taking this approach with State.
df = df.drop(columns=['TimeToA1Completion', 'TimeToOrderEntry'])
df = df.dropna(subset=['State'])
Time to create df1 and df2. Recall, df1 is for RQ1, which is looking at all requests (many of which have not been booked), and df2 is for RQ2, which is looking at only requests that have been scheduled and/or completed. In other words, df2 is a subset of df1, with the addition of "booking" variables that would have been null in df1.
df1 = df.drop(columns=['AppointmentDate', 'BookingTime'])
df2 = df[df['StageID'] >= 4].copy()  # .copy() avoids a SettingWithCopyWarning when columns are added to df2 later
# StageID of 4 = scheduled, StageID of 5 = completed
Let's double check both DataFrames for null values:
print(df1.isnull().any())
print(df2.isnull().any())
StageID             False
StageName           False
Modality            False
ChannelSource       False
State               False
Cancelations        False
InternalTasks       False
ExternalTasks       False
PriorAuthStatus     False
InternalComments    False
B2BComments         False
ICComments          False
HasReportFile       False
Complexity          False
dtype: bool
StageID             False
StageName           False
Modality            False
ChannelSource       False
AppointmentDate      True
State               False
Cancelations        False
InternalTasks       False
ExternalTasks       False
PriorAuthStatus     False
InternalComments    False
B2BComments         False
ICComments          False
HasReportFile       False
BookingTime          True
Complexity          False
dtype: bool
print("AppointmentDate: ", len(df2[df2['AppointmentDate'].isnull() == True]))
print("BookingTime: ", len(df2[df2['BookingTime'].isnull() == True]))
AppointmentDate:  3
BookingTime:  1
Again, out of 80,000+ observations, these amounts are negligible so they can be dropped.
df2 = df2.dropna(subset=['BookingTime', 'AppointmentDate'])
print(df2.isnull().any())
StageID             False
StageName           False
Modality            False
ChannelSource       False
AppointmentDate     False
State               False
Cancelations        False
InternalTasks       False
ExternalTasks       False
PriorAuthStatus     False
InternalComments    False
B2BComments         False
ICComments          False
HasReportFile       False
BookingTime         False
Complexity          False
dtype: bool
Perfect! The data is ready to work with.
State variable:
Since I am interested in the relationship between successful bookings and geographic location, I'll start by taking a look at data related to requests by state:
df1[["State"]].describe()
|        | State |
|--------|-------|
| count  | 85326 |
| unique | 50    |
| top    | NY    |
| freq   | 53913 |
New York leads the states in total number of requests at 53,913, which is 63.18% of all requests:
percent = (df1['State'].value_counts()["NY"]) / (len(df1))
print(f"{percent:.2%}")
63.18%
Next, I'll generate a graph showing the percentage distribution of requests by state:
state_counts = df1['State'].value_counts()
state_percentages = state_counts / (len(df1)) * 100
state_percentages.plot.bar(figsize=(20, 10), ylabel='Percentage of test requests', xlabel='State')
<Axes: xlabel='State', ylabel='Percentage of test requests'>
The leading states are New York, Florida, and New Jersey. The following joint and marginal distribution table shows the relationship between StageID and State for State = (NJ | NY | FL):
top_states = df[df['State'].isin(['NJ', 'NY', 'FL'])]
pd.crosstab(top_states.StageID, top_states.State, normalize=True, margins=True)
| StageID | FL       | NJ       | NY       | All      |
|---------|----------|----------|----------|----------|
| 0       | 0.114878 | 0.019356 | 0.268086 | 0.402320 |
| 4       | 0.003031 | 0.000076 | 0.025163 | 0.028271 |
| 5       | 0.162505 | 0.013600 | 0.393304 | 0.569409 |
| All     | 0.280413 | 0.033033 | 0.686554 | 1.000000 |
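Because New York dominates the request volume, a conditional distribution (normalizing within each state) may make cross-state comparison easier. A minimal sketch, reusing top_states from above:

# Conditional distribution (sketch): each column sums to 1, so stages are
# directly comparable across states despite NY's much larger request volume.
pd.crosstab(top_states.StageID, top_states.State, normalize="columns")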
Throughout this report I will refer to requests as "successful", so let's define that term. A successful request is a scan that is at the scheduling stage or beyond: it is either scheduled or completed. What percentage of scan requests falls into this category?
df_scheduled_completed = df1[(df1["StageName"] == "Scheduled") | (df1["StageName"] == "Completed")]
print(f"{(len(df_scheduled_completed) / len(df1)):.2%}")
58.59%
The "best-case scenario" for a scan request is one that is not only scheduled/completed, but has a report file attached. Let's see what percent of requests are at the best-case level:
best_case = df_scheduled_completed[df_scheduled_completed["HasReportFile"] == "Yes"]
print(f"{(len(best_case) / len(df1)):.2%}")
35.63%
Modality variable:
Next, let's look at the concentrations of scan types. Below is the number of successful scan requests per type of scan (the Modality variable):
counts = df_scheduled_completed['Modality'].value_counts()
counts.plot.bar(figsize=(20, 10), ylabel='Number of Scan Requests', xlabel='Type of Scan (Modality)')
<Axes: xlabel='Type of Scan (Modality)', ylabel='Number of Scan Requests'>
From this bar graph, we can see that Ultrasound is by far the most popular scan out of all those scheduled and completed, followed by MRI and Mammogram. Let's see what percentage this is vs. the percentage of Ultrasounds in all scans:
percent_ultrasound_success = len(df_scheduled_completed[df_scheduled_completed["Modality"] == "Ultrasound"])/len(df_scheduled_completed)
print(f"{percent_ultrasound_success:.2%}")
39.44%
percent_all_ultrasounds = len(df1[df1["Modality"] == "Ultrasound"])/len(df1)
print(f"{percent_all_ultrasounds:.2%}")
35.47%
Ultrasounds make up a bit more than a third of all scan requests (35.47%). The share of Ultrasounds among the "successful" group (39.44%) is only slightly higher than in the "all scans" group.
Another variable of interest in this project is Cancelations, which represents the number of times the order was canceled (if any). I hypothesized that zero previous cancelations influence successful bookings, but let's see how many cancelations there are at each level of StageID:
dfx = df[['StageID', 'Cancelations']]
dfx.groupby('StageID').sum()
| StageID | Cancelations |
|---------|--------------|
| 0       | 16995        |
| 4       | 636          |
| 5       | 8008         |
The number of cancelations among successful bookings (StageID 4 and 5) is lower than among unsuccessful ones.
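Since the stage groups contain very different numbers of requests, raw sums can mislead; a per-request average is a fairer comparison. A minimal sketch, reusing dfx from above:

# Average cancelations per request at each stage (sketch): normalizes the
# sums above by the number of requests in each stage group.
dfx.groupby('StageID')['Cancelations'].mean()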
As mentioned before, I added Complexity to the DataFrame(s). Let's take a closer look at that variable and how it relates to other variables of interest:
df['Complexity'].describe()
count    85326.000000
mean        32.751471
std         34.935679
min          2.000000
25%         12.000000
50%         22.000000
75%         40.000000
max        596.000000
Name: Complexity, dtype: float64
The scatter plot below visualizes the relationship between Complexity, Cancelations, and StageID, where all variables are standardized to be on the same scale:
df_std = df[['Complexity', 'Cancelations', 'StageID']]
df_std = ((df_std - df_std.mean())/df_std.std())
df_std.plot.scatter(x="Complexity", y="Cancelations", c="StageID",
                    cmap="plasma", alpha=.5);
We can see that successful requests are highly concentrated in the low-complexity, zero-cancelation region. This is promising for my first hypothesis; however, model creation and evaluation are still necessary.
To perform more advanced calculations on BookingTime, I will convert these values from Timedelta objects into floats, where the unit is days:
df2["BookingTimeInts"] = (df2["BookingTime"].dt.days +
(df2["BookingTime"].dt.components["hours"]/24) + # 24 hours per day
(df2["BookingTime"].dt.components["minutes"]/1440) + # 1440 minutes per day
(df2["BookingTime"].dt.components["seconds"])) # 86400 seconds per day
This next scatter plot shows the relationship between complexity and time taken to book, also standardized to be on the same scale:
df2_std = df2[['Complexity', 'BookingTimeInts']]
df2_std = ((df2_std - df2_std.mean())/df2_std.std())
plt.scatter(df2_std["Complexity"], df2_std["BookingTimeInts"])
plt.xlabel("Complexity of Request")
plt.ylabel("Time Taken to Book (from Time Requested) (In Days)")
plt.show()
What can we see? Complexity and booking time appear positively related, but with substantial noise. To formalize these observations, I will build two prediction models:
I will use the state, modality, cancellations, and complexity score (IVs) to predict whether an appointment will be booked (DV). I will use a K-Nearest-Neighbors Classifier, as this DV is categorical.
I will use the complexity score to predict the time between when the request is made and appointment is booked. I will use a K-Nearest-Neighbors Regressor, as this DV is continuous.
As mentioned above, I am going to implement a simple 5-nearest-neighbors classifier to predict the StageID of a scan request:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer
# Define the training data and represent features as a list of dictionaries
features = ["State", "Modality", "Cancelations", "Complexity"]
x_train_dict = df1[features].to_dict(orient="records")
y_train = df1["StageID"]
# dummy encoding:
vec = DictVectorizer(sparse=False)
vec.fit(x_train_dict)
x_train = vec.transform(x_train_dict)
# scale the features:
scaler = StandardScaler()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
# Fit the 5-nearest neighbors model:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train_std, y_train)
y_train_pred = model.predict(x_train_std)
y_train_pred
array([0, 0, 0, ..., 0, 5, 5])
What will the model predict for a request that matches the first hypothesis?
x_new_dict = [{
    "State": "NY",
    "Modality": "Ultrasound",
    "Cancelations": 0,
    "Complexity": 5  # recall 50th percentile of Complexity is ~22
}]
x_new = vec.transform(x_new_dict)
x_new_std = scaler.transform(x_new)
print(model.predict(x_new_std))
model.predict_proba(x_new_std)
[0]
array([[0.6, 0. , 0.4]])
With a 5-nearest-neighbors model, the prediction is that such a request is of StageID == 0 (which corresponds to StageName == Closed) with a 60% probability. I will evaluate accuracy at each k and choose the k with the highest accuracy to use for the model:
def get_accuracy(k):
    model = KNeighborsClassifier(n_neighbors=k)
    pipeline = Pipeline([
        ("scaler", scaler),
        ("model", model)
    ])
    accuracy = cross_val_score(pipeline, x_train, y_train, cv=10, scoring="accuracy").mean()
    return accuracy
ks = pd.Series(range(1, 31))
ks.index = range(1, 31)
accuracies = ks.apply(get_accuracy)
accuracies.plot.line()
accuracies.sort_values()
1     0.551650
2     0.551755
4     0.596981
3     0.599513
5     0.618253
10    0.619143
11    0.620632
6     0.621300
7     0.622882
8     0.624136
12    0.624488
13    0.628168
15    0.628261
9     0.628706
23    0.629621
21    0.629773
17    0.630019
27    0.630523
22    0.630547
14    0.630605
25    0.630640
19    0.631004
26    0.631050
16    0.631590
29    0.631683
24    0.631976
20    0.632187
30    0.632222
28    0.632773
18    0.633395
dtype: float64
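Rather than reading the table by eye, the best k can also be selected programmatically; a minimal sketch:

# Select the k with the highest cross-validated accuracy (sketch):
best_k = accuracies.idxmax()
print(best_k)  # 18, matching the table above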
k=18 optimizes accuracy:
# Fit the 18-nearest neighbors model:
model = KNeighborsClassifier(n_neighbors=18)
model.fit(x_train_std, y_train)
# Use the model to predict on the same test point
print(model.predict(x_new_std))
model.predict_proba(x_new_std)
[0]
array([[0.83333333, 0. , 0.16666667]])
With the value of k that optimizes accuracy, the prediction is the same, but with a higher probability. I subjectively chose the value of 5 for "low complexity" on the new test point, so I will now use the model to predict whether requests with Complexity == 0, 12, and 20 yield the same result. I chose these numbers because they are other reasonable ways to define "low" based on the distribution of Complexity:
x_new_dict_0 = [{
    "State": "NY",
    "Modality": "Ultrasound",
    "Cancelations": 0,
    "Complexity": 0
}]
x_new_0 = vec.transform(x_new_dict_0)
x_new_0_std = scaler.transform(x_new_0)
print(model.predict(x_new_0_std))
model.predict_proba(x_new_0_std)
[0]
array([[0.61111111, 0. , 0.38888889]])
x_new_dict_12 = [{
    "State": "NY",
    "Modality": "Ultrasound",
    "Cancelations": 0,
    "Complexity": 12
}]
x_new_12 = vec.transform(x_new_dict_12)
x_new_12_std = scaler.transform(x_new_12)
print(model.predict(x_new_12_std))
model.predict_proba(x_new_12_std)
[5]
array([[0.38888889, 0. , 0.61111111]])
x_new_dict_20 = [{
    "State": "NY",
    "Modality": "Ultrasound",
    "Cancelations": 0,
    "Complexity": 20
}]
x_new_20 = vec.transform(x_new_dict_20)
x_new_20_std = scaler.transform(x_new_20)
print(model.predict(x_new_20_std))
model.predict_proba(x_new_20_std)
[5]
array([[0., 0., 1.]])
If we combine and average the probabilities of these low-complexity requests being at stage Closed or Completed...
prob_0 = (0.833 + 0.611 + 0.389 + 0) / 4
prob_5 = (0.167 + 0.389 + 0.611 + 1) / 4
print(prob_0, prob_5)
0.45825 0.54175
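For reproducibility, the same averages can be computed directly from the model rather than from the rounded printed values; a minimal sketch:

# Recompute the averaged class probabilities straight from the model (sketch):
low_complexities = [5, 0, 12, 20]
probs = [model.predict_proba(scaler.transform(vec.transform([{
            "State": "NY", "Modality": "Ultrasound",
            "Cancelations": 0, "Complexity": c}])))[0]
         for c in low_complexities]
print(np.mean(probs, axis=0))  # average probability per class (StageID 0, 4, 5)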
We can see that it is more likely that a low-complexity request will be Completed rather than Closed. Below, I will compute both the precision and recall of this model: precision is the proportion of requests predicted to be successful that truly are successful, whereas recall is the proportion of truly successful requests that the model correctly identifies:
pipeline = Pipeline([
    ("scaler", scaler),
    ("model", model)
])
# Precision:
cross_val_score(pipeline, x_train, (y_train >= 4),
                cv=10, scoring="precision").mean()
0.7173601228151975
# Recall:
cross_val_score(pipeline, x_train, (y_train >= 4),
                cv=10, scoring="recall").mean()
0.6962038847769554
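As a cross-check on these scores, precision and recall can also be derived from a confusion matrix over cross-validated predictions; a minimal sketch:

# Cross-check (sketch): build a confusion matrix from cross-validated
# predictions and recompute precision and recall from its cells.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
y_pred = cross_val_predict(pipeline, x_train, (y_train >= 4), cv=10)
tn, fp, fn, tp = confusion_matrix((y_train >= 4), y_pred).ravel()
print("precision:", tp / (tp + fp))  # of requests predicted successful, the share that truly are
print("recall:   ", tp / (tp + fn))  # of truly successful requests, the share the model finds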
In conclusion, scan requests in which the location of the requested appointment is New York, the type of scan is an Ultrasound, there have been no previous cancelations, and complexity is low (below the 50th percentile of all Complexity scores) are likely to become successful, with ~54% probability. This is supported by the model's moderate (but balanced) precision and recall.
As mentioned above, I am going to implement a K-nearest-neighbors regressor to predict BookingTime based on Complexity:
def get_kNN_pred(x_new, n):
    """Given a new observation, returns the n-nearest-neighbors prediction."""
    dists = np.sqrt(((x_train - x_new)**2).sum(axis=1))
    inds_sorted = dists.sort_values().index[:n]
    return y_train.loc[inds_sorted].mean()
x_train = df2[["Complexity"]]
y_train = df2["BookingTimeInts"]
x_new = pd.DataFrame()
x_new["Complexity"] = np.arange(0, 600, 5)
The graph below shows the scatter plot of Complexity and BookingTime from the DataFrame, overlaid with the regression model's prediction line for k=100:
# Predictive model for k=100 nearest neighbors:
y_pred_100 = x_new.apply(get_kNN_pred, axis=1, args=(100,))
y_pred_100.index = x_new["Complexity"]
ax = df2.plot.scatter(x="Complexity", y="BookingTimeInts", color="black", alpha=.2)
y_pred_100.plot.line(color="blue", ax=ax)
<Axes: xlabel='Complexity', ylabel='BookingTimeInts'>
Complexity and BookingTime are positively correlated, supporting my second hypothesis. Next, I will evaluate the strength of this correlation:
index_array = np.array(y_pred_100.index)
values_array = np.array(y_pred_100.values)
np.corrcoef(index_array, values_array)[0, 1]
0.9343630142678436
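Note that this r is computed on the smoothed model predictions rather than the raw observations; the raw correlation, which is expected to be weaker since individual requests are noisy, can be checked with a one-line sketch:

# Correlation on the raw (unsmoothed) observations, for comparison (sketch):
df2["Complexity"].corr(df2["BookingTimeInts"])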
Based on this regression model, the correlation between BookingTime and Complexity is positive and very strong. To evaluate the performance of this model, I'll compute the cross-validation error at different levels of k:
from sklearn.neighbors import KNeighborsRegressor
x_dict = df2[["Complexity"]].to_dict(orient="records")
y = df2["BookingTimeInts"]
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    mae = np.mean(-cross_val_score(
        pipeline, x_dict, y,
        cv=10, scoring="neg_mean_absolute_error"
    ))
    return mae
ks = pd.Series(range(1, 101))
ks.index = range(1, 101)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()
test_errs.sort_values()
100    20.999402
99     21.022531
98     21.037882
97     21.054166
96     21.069855
         ...
5      33.259823
4      34.549659
3      35.233311
2      36.569867
1      38.069530
Length: 100, dtype: float64
From this graph, we can see that the (cross-validation) test error decreases as k increases and starts to plateau around k=100, where the error is ~21. I predict it will continue to plateau as k increases, but let's double-check with a very large value of k:
k = pd.Series(range(30000, 30001))
k.index = range(30000, 30001)
test_err = k.apply(get_cv_error)
print(test_err)
30000 19.721953 dtype: float64
The test error decreases as k increases and effectively plateaus once k reaches about 100; even at a very large k (30,000), the error improves only slightly (~19.7 vs. ~21).
Findings
The bulk of all scan requests are from New York and are for Ultrasounds.
Cancelations are more prevalent in unsuccessful scan requests than in successful ones.
The following values were able to predict a successful scan request: State = NY, Modality = Ultrasound, Cancelations = 0, and low Complexity (below the 50th percentile).
However, the precision (0.717) and recall (0.696) of this classification model, along with the modest probability of the above prediction (54%), indicate that future work may be needed to create a better model for this prediction.
The complexity of a scan request and the time taken to book the appointment are strongly and positively correlated (r = 0.934).
Limitations
As displayed in some of the graphs and tables in this report, there was no representation of StageID values 1-3 in this data set (respectively: Requested, Qualified, and Imaging Center Assigned). Even though the StageID variable was primarily treated in this report as successful vs. unsuccessful, the lack of representation of these intermediate values makes this a non-representative sample.