-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathclean_survey_data.py
More file actions
62 lines (51 loc) · 2.09 KB
/
clean_survey_data.py
File metadata and controls
62 lines (51 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 29 11:50:26 2016
@author: sglyon
"""
#%% read data
import pandas as pd
# The address is kept in two pieces (repository root, then the path of the
# file inside it) purely so that neither line gets too long.
url1 = "https://raw.githubusercontent.com/NYUDataBootcamp/"
url2 = "Materials/master/Data/fall16_ug_pace_raw.csv"
url = f"{url1}{url2}"
# Download the raw survey responses straight into a DataFrame.
df = pd.read_csv(url)
#%% rename columns
# Labels are assigned by position, replacing whatever headers the CSV had.
column_labels = ["time", "experience", "pace", "help"]
df.columns = column_labels
#%% clean up dates
# Parse the "time" column into pandas datetime values.
timestamps = pd.to_datetime(df["time"])
df["time"] = timestamps
#%% make experience column 0, 1 indicator
# Work on a copy so the replacements below never write through to df.
exp = df["experience"].copy()
# Swap each answer label for its numeric flag, one label at a time.
for answer, flag in {"No": 0, "Yes": 1}.items():
    exp[exp == answer] = flag
# The float cast fails loudly if any other text answer is still present.
df["experience"] = exp.astype(float)
#%% Make pace column categorical
# Only the dtype changes: the existing answer values become the categories.
# No numeric (0, 1, 2) recoding is performed here.
df["pace"] = df["pace"].astype("category")
#%% Split multiple response column
# Each "help" answer is a ";"-separated list of topics. Collect every distinct
# topic that appears in at least one response.
help_str = df["help"].str
help_list = help_str.split(r";").tolist()
topics = set()
for response in help_list:
    # Entries that did not split into a list (e.g. missing answers) carry no
    # topics and are skipped.
    if not isinstance(response, list):
        continue
    for piece in response:
        label = piece.strip()
        # An empty piece (e.g. from a trailing ";") is not a topic, and as a
        # substring it would match every row in the test further down.
        if label:
            topics.add(label)

# Short column name for each full topic label used in the survey.
new_names = {
    'Conditionals (if/else) and comparisons (<, >, ==, etc.) -- from python fundamentals 2': 'conditionals',
    'DataFrame properties and methods -- from Pandas 1: Data input': 'dataframe_methods',
    'Defining our own functions -- from python fundamentals 2': 'functions',
    'Dictionaries -- from python fundamentals 2': 'dictionaries',
    'Importing packages -- from Pandas 1: Data input': 'importing',
    'List comprehensions (for inside square brackets) -- from python fundamentals 2': 'list_comprehensions',
    'Loops (for) --- from python fundamentals 2': 'loops',
    'Objects and methods -- from python fundamentals 1': 'objects',
    'Reading data from internet -- from Pandas 1: Data input': 'internet_data',
    'Slicing (square brackets) -- from python fundamentals 2': 'slicing',
    'Working with variables in DataFrames -- from Pandas 1: Data input': 'df_variables',
}

# Make one 0/1 indicator column per topic. Iterating in sorted order keeps the
# order of the new columns the same on every run; iterating the set directly
# does not, because string hashing is randomised per process.
for topic in sorted(topics):
    # A topic that is missing from new_names raises KeyError here, which
    # flags a survey option that still needs a short name.
    colname = new_names[topic]
    # Plain substring match; na=False treats a missing answer as "not selected".
    has_topic = help_str.contains(topic, regex=False, na=False)
    df[colname] = has_topic.astype("int64")
#%% drop old help column
# The per-topic indicator columns replace the raw text; remove it in place.
df.drop(columns=["help"], inplace=True)