We'll first load the data and convert it to a data.frame.
library(rjson)
# Read the survey export; the responses are the first element of the JSON.
data <- fromJSON(file = "data/data.json")[[1]]

# Union of every field name that appears in any response, sorted, so that
# all rows end up with the same columns in the same order.
names <- sort(unique(unlist(lapply(data, names))))

# Align each response to the full field list. Fields a respondent did not
# supply come back as NULL from the name lookup and are filled with FALSE.
# lapply() builds the list in one pass and is a no-op for zero responses,
# where looping over 1:length(data) would index a non-existent element.
ordDat <- lapply(data, function(response) {
  aligned <- response[names]
  aligned[vapply(aligned, is.null, logical(1))] <- FALSE
  names(aligned) <- names
  aligned
})

# Stack the aligned responses into one data frame, one row per response.
dat <- do.call(rbind.data.frame, ordDat)
Since we're treating each question independently, we won't lose any information if we shuffle the data around for privacy's sake…
# Privacy shuffle: 10000 times, pick one column and two rows uniformly at
# random and exchange those two cells. Values never leave their column, so
# per-question counts are unchanged.
for (i in seq_len(10000)) {
  # floor(runif(..., 1, n + 1)) yields an index in 1..n; the column is drawn
  # first, then both rows in a single call.
  col <- floor(runif(1, min = 1, max = ncol(dat) + 1))
  rows <- floor(runif(2, min = 1, max = nrow(dat) + 1))

  # Exchange the two answers through a temporary.
  buffer <- dat[rows[1], col]
  dat[rows[1], col] <- dat[rows[2], col]
  dat[rows[2], col] <- buffer
}
We'll then want to update the columns to actually contain useful information rather than a numeric representation of the selection that was made.
#' Convert numeric answer codes into labelled factor values
#'
#' @param column Vector of answer codes (factor, character, or numeric). Each
#'   entry is coerced with `as.character()` and then `as.integer()`.
#' @param levels Character vector of labels; code `k` maps to `levels[k]`.
#' @return A factor the same length as `column` whose levels are `levels`, in
#'   the order given. Entries that are non-numeric, or whose code falls
#'   outside `1..length(levels)`, are `NA`.
calcLevels <- function(column, levels) {
  # Non-numeric entries become NA here; the coercion warning is expected.
  vals <- suppressWarnings(as.integer(as.character(column)))

  # Used directly as an index, a 0 would silently shorten the result and a
  # negative code would remove labels, so blank out-of-range codes first.
  vals[!is.na(vals) & (vals < 1L | vals > length(levels))] <- NA_integer_

  # Build a real factor: assigning levels() to a character vector only sets
  # an attribute, which table() ignores (alphabetical order, empty levels
  # dropped).
  factor(levels[vals], levels = levels)
}
#' Placeholder for merging the `<title>_*` columns of a data frame
#'
#' Intended to look for columns whose names start with `<title>_` and
#' aggregate them together. Not implemented yet: neither argument is used and
#' every call returns `NULL`.
#'
#' @param title Prefix shared by the columns to merge (currently unused).
#' @param data Data frame to search (currently unused).
#' @return `NULL`.
mergeColumns <- function(title, data) {
  NULL
}
In total, we received 42 responses to the survey. We can analyze each below.
# Coerce ZIP codes to integers. Entries that are not numeric become NA (the
# coercion warning is suppressed) and table() leaves NA out by default.
dat[["zip"]] <- suppressWarnings(as.integer(as.character(dat[["zip"]])))
table(dat[["zip"]])
##
## 75002 75007 75010 75013 75019 75023 75034 75038 75050 75063 75071 75075
## 1 1 1 1 2 1 1 1 1 1 1 1
## 75082 75089 75098 75202 75204 75205 75209 75214 75229 75244 75251 75287
## 1 2 1 1 1 1 1 1 1 1 3 3
## 76006 76092 76132 76248 95139
## 1 2 1 1 1
# Single-choice questions: replace each numeric answer code with its label,
# then chart how often every label occurs.
dat[["distance"]] <- calcLevels(
  dat[["distance"]],
  c("0-10", "11-30", "31-50", "50+")
)
barplot(table(dat[["distance"]]), xlab = "Distance in Miles")

dat[["attended"]] <- calcLevels(
  dat[["attended"]],
  c("0", "1-2", "3-5", "5+")
)
barplot(table(dat[["attended"]]), xlab = "Number Attended")

dat[["frequency"]] <- calcLevels(
  dat[["frequency"]],
  c("Weekly", "Biweekly", "Monthly", "Annually")
)
barplot(table(dat[["frequency"]]))

dat[["online"]] <- calcLevels(dat[["online"]], c("Yes", "No"))
barplot(table(dat[["online"]]))
# The topic checkboxes are all recoded and charted the same way, so the
# column -> chart title pairs live in one lookup instead of four copies.
topicTitles <- c(
  topics_1 = "Job Advertisements",
  topics_2 = "Networking",
  topics_3 = "Lectures on R",
  topics_4 = "Workshops or Hack-a-thons"
)
for (topicCol in names(topicTitles)) {
  # Note the label order here is No/Yes, unlike the other checkbox groups.
  dat[[topicCol]] <- calcLevels(as.integer(dat[[topicCol]]), c("No", "Yes"))
  barplot(table(dat[[topicCol]]), main = topicTitles[[topicCol]])
}
# Print the non-empty entries of the quick_talk column as text.
local({
  answers <- as.character(dat$quick_talk)
  answers[answers != ""]
})
## [1] "use age of social media data through R"
## [2] "Integrating with larger systems and other languages. | performance of R verusus SAS, Python, Ruby, Java, etc. for certain tasks"
## [3] "Shiny, knitr, sweave"
## [4] "Hands on Tutorials on a variety of advanced r usage"
## [5] "Definition of basic vocabulary terms."
## [6] "creating and using your own functions"
## [7] "I'd be interested to see more talks on how people are using R in their domain."
## [8] "Nothing specific. Tips & Tricks would be nice"
## [9] "Domain-specific talks \"How I use R in marketing/biology/finance/etc.\""
## [10] "FALSE"
## [11] "FALSE"
## [12] "I'd be interested to see talks on how R is being used by others in their own projects."
## [13] "industrial trend, recent news update, product evaluation, R tips, R resources, book review, challenging topics, statistics tips, academic and business conferences and meeting information"
## [14] "New to R with 25 years experience in IT. Recently purchased a book to get started. I would like a list of items created by experienced R developers for things they wish someone had told them when they started."
## [15] "Random forests"
## [16] "Quick review of how to do something simple and cool in r (may be using an inbuilt function in R that is not found in other tools)."
# Print the non-empty entries of the long_talk column as text.
local({
  answers <- as.character(dat$long_talk)
  answers[answers != ""]
})
## [1] "FALSE"
## [2] "formatting and printing quality output"
## [3] "More Big Data | Machine Learning with R"
## [4] "Objects & Object oriented programming in R"
## [5] "Creating charts | Survey of the top data mining techniques and when to use each"
## [6] "Handling large data sets, too big for memory. And techniques for rewriting SAS programs into R"
## [7] "Practical R applications."
## [8] "FALSE"
## [9] "Domain-specific talks \"How I use R in marketing/biology/finance/etc.\""
## [10] "GPU matrix computation; batch submission; internal data structures"
## [11] "New packages (similar to the ff package and shiny topics covered recently)."
## [12] "Social Media tracking - specifically using TwitteR"
## [13] "Demonstration of making R packages | R and Twitter perhaps using the twitteR package"
## [14] "PCA, various apply functions, comparison to other languages like Python"
## [15] "I am working on a SQL Server , R, an other BI tools to show how to combine and use each tool for its strengths. Still a work in progress "
## [16] "case study, project work, business application, problems to be solved, discuss a recent posts / blogs, statistics topic of the month and how to use it in R, guest speakers (keep them to be 90 minutes or shorter)"
## [17] "sentiment analysis"
## [18] "data structures | data management / manipulation | graphics | conditional and looping structures / syntax"
## [19] "big memory, ff"
## [20] "More statistical talks explaining the underlying concepts of data mining."
## [21] "Random forests"
# Replace the numeric experience code with its label and chart the counts.
dat[["experience"]] <- calcLevels(
  dat[["experience"]],
  c("Never Used", "Beginner", "Intermediate", "Advanced")
)
barplot(table(dat[["experience"]]))
# Print the non-empty entries of the get_started column as text.
local({
  answers <- as.character(dat$get_started)
  answers[answers != ""]
})
## [1] "The marketing research course. Introduced by professor"
## [2] "FALSE"
## [3] "looking for better visualizations"
## [4] "It is referenced in a lot of the papers I have been reading."
## [5] "Downloaded it, read online manual"
## [6] "To do what SAS will not do."
## [7] "GOt a copy of Revolution Analytics, grabbed a beer one Saturday afternoon and started coding"
## [8] "A need to do data analysis and charting with a free tool."
## [9] "At work. For a target marketing project"
## [10] "Data competitions"
## [11] "FALSE"
## [12] "Bought books and downloaded Revolution R as a student"
## [13] "Was asked to learn for biostatistics projects."
## [14] "Econometric analysis"
## [15] "using it at my current job"
## [16] "Coursera course"
## [17] "Installed it and read \"Introduction to R\" "
## [18] "recommended by co-worker"
## [19] "It's slowly being introduced as a SAS alternative at my work"
## [20] "Project"
## [21] "Interest in BigData"
## [22] "using S many years ago"
## [23] "I learned R as an undergraduate in statistics classes. I frequently use R in my research."
## [24] "In school"
## [25] "Doing stats in graduate school."
## [26] "Wanted to introduce my students to a very nice open source statistical package."
## [27] "I am mostly a PERL programmer, and was looking for a tool that had statistical functions built into it so that i didn't have to reinvent them in PERL."
## [28] "I took a class on Coursera"
## [29] "I have a Exadata and Big Data Appliance to play with and started when I found out about the Big Data Connectors for R. Think this might be a growing field so I'm learning statistics and R."
## [30] "I haven't really. I'm reading a book."
# Print the non-empty entries of the get_out_of column as text.
local({
  answers <- as.character(dat$get_out_of)
  answers[answers != ""]
})
## [1] "FALSE"
## [2] "A sense of the local R community, an idea of the R related issues that are important to the community and ideas for how Revo could be hepful."
## [3] "I'd like to meet other people and network, and also see how people are using R in their workplace."
## [4] "Useful examples, hands-on practice"
## [5] "Networking and learning more about how people use R."
## [6] "Learning how to use R better. Meeting local R users."
## [7] "better understand of how to utilize R "
## [8] "Proficiency with R, Hadoop and predictive modeling techniques."
## [9] " have more practical chances to work on real world problems."
## [10] "better understanding of when to use R and what approaches to take when analyzing a problem"
## [11] "Self-Motivate myself to do more with R | Learn"
## [12] "Networking opportunities"
## [13] "Collaboration and networking."
## [14] "Learn more about how R is used and interact with other users."
## [15] "More tutorials on various statistical techniques"
## [16] "Keep my learning of R alive. Share ways of doing things and learn about all the different places where it is applied. A potential future career path may come out these interactions."
## [17] "Deeper knowledge of R | Meet R Users"
## [18] "Learn!"
## [19] "Learn more about R Script's capabilities and related solutions and topics."
## [20] "FALSE"
## [21] "Training, meeting people"
## [22] "Improving my skills & networking"
## [23] "I would like to continue to advance my career without taking a management role. I enjoy programming and have heard the compensation is above average for R programmers and that it's not uncommon to obtain twice the pay a normal programmer receives. I would first of all like a compensation reality check from members who have been R programmers for at least 2 years. If this is true then I would like a mentor to guide me through the process of becoming an R programmer."
## [24] "An environment where fellow users can share tips, tricks, experiences, etc."
## [25] "More knowledge of open source statistical practice."
# The seven environment checkboxes are recoded and charted identically, so
# the column -> chart title pairs live in one lookup instead of seven copies.
environmentTitles <- c(
  environment_1 = "RStudio",
  environment_2 = "StatET",
  environment_3 = "RCommander",
  environment_4 = "Emacs/ESS",
  environment_5 = "Revolution R",
  environment_6 = "Text Editor",
  environment_7 = "Other"
)
for (environmentCol in names(environmentTitles)) {
  dat[[environmentCol]] <- calcLevels(
    as.integer(dat[[environmentCol]]),
    c("Yes", "No")
  )
  barplot(
    table(dat[[environmentCol]]),
    main = environmentTitles[[environmentCol]]
  )
}
I suppose we won't mention to Revolution that they lost out to “Text Editor”…
# Print the non-empty entries of the doing_with_r column as text.
local({
  answers <- as.character(dat$doing_with_r)
  answers[answers != ""]
})
## [1] "Data editing. Regression analysis."
## [2] "Web development, real-time intelligence"
## [3] "I would like to continue to advance my career as a programmer rather than taking a management role. With regard to one of the questions below I have an Associates Degree and have taken Business Intelligence courses from SMU and TCU."
## [4] "Finance and economics"
## [5] "Large Data set analysis. I've got 1/2 a pedabyte to play with. Very interested in analytics that lead to decisions. For example, machine learning."
## [6] "data analysis."
## [7] "learning"
## [8] "I have a hobby about anything with data."
## [9] "Machine learning. "
## [10] "Bayesian statistics, time series, machine learning"
## [11] "genomics, finance, market analysis"
## [12] "Mostly crunching numbers and automating reports. I often use R just to make statistical calculations from the raw data, and then feed the R results into PERL or a graphing program to create the final reports. (R has OK graphics, but I prefer other graphing tools for ease of use and technical capabilities.)"
## [13] "Data Analysis/Machine Learning"
## [14] "Developing an integrated system similar to What John Deer uses for the production forecasting. Free you mind the only boundary you have live there. I was raised with the belief that if you can think of it you can make it"
## [15] "Data analysis for airlines"
## [16] "data mining"
## [17] "Large-scale modeling of marketing channel attribution, along with other analytics related to online marketing."
## [18] "FALSE"
## [19] "Econometric Analysis"
## [20] "FALSE"
## [21] "finding better ways to analyze data and garner insights from them"
## [22] "Migrate workloads from SAS to R"
## [23] "Analyze experimental data"
# r_pkg has a third label, so it is handled on its own.
dat$r_pkg <- calcLevels(as.integer(dat$r_pkg), c("Yes", "No", "NA"))
barplot(table(dat$r_pkg))

# The content and interest checkboxes all share the Yes/No recode and a
# titled bar chart; list them once and loop in the original order.
checkboxTitles <- c(
  content_1 = "Data Viz",
  content_2 = "Manipulating Data",
  content_3 = "Enterprise Deployment",
  content_4 = "Domain-Specific",
  interests_about_r_1 = "Open-Source",
  interests_about_r_2 = "Free",
  interests_about_r_3 = "Packages",
  interests_about_r_4 = "Cross-Platform",
  interests_about_r_5 = "Ease of Use",
  interests_about_r_6 = "Community"
)
for (checkboxCol in names(checkboxTitles)) {
  dat[[checkboxCol]] <- calcLevels(
    as.integer(dat[[checkboxCol]]),
    c("Yes", "No")
  )
  barplot(table(dat[[checkboxCol]]), main = checkboxTitles[[checkboxCol]])
}
# Print the non-empty entries of the domain column as text.
local({
  answers <- as.character(dat$domain)
  answers[answers != ""]
})
## [1] "Marketing Analytics"
## [2] "Finance"
## [3] "Semiconductor Manufacturing"
## [4] "AIrlines & travel"
## [5] "Education"
## [6] "Academia (genetics and genomics)"
## [7] "government, defense"
## [8] "Pharmaceutical Distribution"
## [9] "Consulting"
## [10] "University"
## [11] "Finance"
## [12] "Analytics"
## [13] "Enginnering/Manufacturing/Research"
## [14] "R&D"
## [15] "Finance"
## [16] "marketing analytics"
## [17] "software development"
## [18] "FALSE"
## [19] "Consulting"
## [20] "marketing"
## [21] "Finance"
## [22] "Technology Consulting"
## [23] "it"
## [24] "Healthcare"
## [25] "Finance"
## [26] "Bioinformatics"
## [27] "Manufacturing Quality"
## [28] "Auto Finance"
## [29] "Engineering"
## [30] "Travel"
## [31] "telecom"
## [32] "Consumer Goods"
## [33] "Life science"
## [34] "FALSE"
## [35] "Healthcare"
## [36] "finance"
## [37] "Healthcare "
## [38] "Social Science / Survey Research"
# Print the non-empty entries of the formal_training column as text.
local({
  answers <- as.character(dat$formal_training)
  answers[answers != ""]
})
## [1] "Political Science"
## [2] "Engineering, Mathematics"
## [3] "Distributed data intensive systems and software engineering"
## [4] "Mechanical Engineering, Electrical Engineering, Oracle DBA, networking, System Administration"
## [5] "Developer DBA / Data Warehouse / SQL Server Analysis Services - MDX & DMX"
## [6] "Financial Econometrics"
## [7] "FALSE"
## [8] "Marketing Analytics"
## [9] "FALSE"
## [10] "math, statistics"
## [11] "math, statistics, operations research"
## [12] "Computer"
## [13] "Mathematics, Computer Science, Operations Research"
## [14] "math"
## [15] "Business "
## [16] "Math/Stat, Finance"
## [17] "economics, programming"
## [18] "Statistics, Finance, Management Strategy"
## [19] "electronics"
## [20] "Chemical Engineering, Probability & Statistics, Software Development, Technical Writing"
## [21] "Statistics, programming, mathematical modeling"
## [22] "social sciences; data analysis"
## [23] "Statistics"
## [24] "Genetics"
## [25] "Economics, Management, IT, computer programming, Business Intelligence"
## [26] "Informatics, software development"
## [27] "anthropology, finance,programming, statistics"
## [28] "school"
## [29] "Engineering"
## [30] "Machine Learning"
## [31] "Physics, applied math"
## [32] "Computer science, bioinformatics, software development, IT"
## [33] "Computer science"
## [34] "Computer Science, Semiconductor manufacturing technology"
## [35] "molecular biology, genetics, genomics, statistics"
## [36] "Accredited Professional Statistician"
## [37] "Statistics and econometrics"
# The statistics and programming checkboxes share the Yes/No recode and a
# titled bar chart; list them once and loop in the original order.
skillTitles <- c(
  statistics_1 = "Probability Distributions",
  statistics_2 = "Hypothesis Testing",
  statistics_3 = "Regression Analysis",
  programming_1 = "Data Structures",
  programming_2 = "Databases",
  programming_3 = "Lexical Scoping",
  programming_4 = "Web Development"
)
for (skillCol in names(skillTitles)) {
  dat[[skillCol]] <- calcLevels(as.integer(dat[[skillCol]]), c("Yes", "No"))
  barplot(table(dat[[skillCol]]), main = skillTitles[[skillCol]])
}
# Demographic single-choice questions: swap each numeric code for its label
# and chart the counts.
dat[["education"]] <- calcLevels(
  dat[["education"]],
  c("High School", "Bachelor's", "Master's", "Ph.D.")
)
barplot(table(dat[["education"]]))

dat[["income"]] <- calcLevels(
  dat[["income"]],
  c("<35k", "35k-55k", "55k-75k", "75k-100k", "100k-150k", "150k+")
)
barplot(table(dat[["income"]]))

dat[["age"]] <- calcLevels(
  dat[["age"]],
  c("<18", "19-25", "26-34", "35-49", "50-69", ">70")
)
barplot(table(dat[["age"]]))
50% of respondents are 35-49, 31% are 50-69, and the rest are 19-34.
# Swap the numeric gender and race codes for their labels and chart each.
dat[["gender"]] <- calcLevels(dat[["gender"]], c("Male", "Female"))
barplot(table(dat[["gender"]]))

dat[["race"]] <- calcLevels(
  dat[["race"]],
  c("Am. Ind.", "Asian", "Black", "Hisp.", "Pacific", "White", "Multiple")
)
barplot(table(dat[["race"]]))
Only a few languages were present and my code didn't work for languages which no one selected. Manually filtering below.
# Hand-picked language checkboxes: each one is recoded to Yes/No, announced
# by name, and tallied. The column -> label pairs are listed once and looped
# over instead of repeating the same three statements per language.
languageTitles <- c(
  language_1 = "English",
  language_2 = "Spanish",
  language_10 = "Hindi",
  language_11 = "Chinese Languages",
  language_12 = "Other"
)
for (languageCol in names(languageTitles)) {
  dat[[languageCol]] <- calcLevels(
    as.integer(dat[[languageCol]]),
    c("Yes", "No")
  )
  # Explicit print() calls: values are not auto-printed inside a loop.
  print(languageTitles[[languageCol]])
  print(table(dat[[languageCol]]))
}
## [1] "English"
##
## No Yes
## 11 31
## [1] "Spanish"
##
## No Yes
## 40 2
## [1] "Hindi"
##
## No Yes
## 39 3
## [1] "Chinese Languages"
##
## No Yes
## 39 3
## [1] "Other"
##
## No Yes
## 41 1