diff --git a/docs/search.json b/docs/search.json
index ff0c0b5..a46fbe5 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -161,326 +161,354 @@
"text": "harriet.mason1@monash.edu\n @marriethason\n @harriet-mason\n Interests:\n \n statistical computingdata visualisationdata storytellingvisual explanations"
},
{
- "objectID": "people/current/ohara-wild-mitch/index.html",
- "href": "people/current/ohara-wild-mitch/index.html",
- "title": "Mitch O’Hara-Wild",
+ "objectID": "people/current/vukcevic-damjan/index.html",
+ "href": "people/current/vukcevic-damjan/index.html",
+ "title": "Damjan Vukcevic",
"section": "",
- "text": "mail@mitchelloharawild.com\n https://www.mitchelloharawild.com/\n @mitchoharawild\n @mitchelloharawild\n Interests:\n \n statistical computingtime seriesforecastingsoftware design"
+ "text": "damjan.vukcevic@monash.edu\n http://damjan.vukcevic.net/\n @VukcevicD\n @dvukcevic\n \n Google scholar\n \n ORCiD\n \n LinkedIn\n Interests:\n \n applied statisticsbayesian inferencedata scienceelection auditingsequential analysisstatistical genomics"
},
{
- "objectID": "people/current/lydeamore-michael/index.html",
- "href": "people/current/lydeamore-michael/index.html",
- "title": "Michael Lydeamore",
+ "objectID": "people/current/hyndman-rob-j/index.html",
+ "href": "people/current/hyndman-rob-j/index.html",
+ "title": "Rob J Hyndman",
"section": "",
- "text": "michael.lydeamore@monash.edu\n https://research.monash.edu/en/persons/michael-lydeamore\n @MikeLydeamore\n @MikeLydeamore\n Interests:\n \n computational statisticsinfectious diseases modelling and epidemiologydata science"
+ "text": "Rob.Hyndman@monash.edu\n https://robjhyndman.com/\n @robjhyndman\n @robjhyndman\n \n Google scholar\n Interests:\n \n forecastingtime seriesexploratory data analysisanomaly detection"
},
{
- "objectID": "people/current/palihawadana-nuwani/index.html",
- "href": "people/current/palihawadana-nuwani/index.html",
- "title": "Nuwani Palihawadana",
+ "objectID": "people/current/fui-swen-kuh/index.html",
+ "href": "people/current/fui-swen-kuh/index.html",
+ "title": "Swen Kuh",
"section": "",
- "text": "nuwani.kodikarapalihawadana@monash.edu\n @nuwani-palihawadana\n Interests:\n \n forecastingtime series analysispredictive modelling"
+ "text": "swen.kuh@monash.edu\n @swenkuh\n @swenk238\n Interests:\n \n Social scienceHierarchical modellingBayesian inference"
},
{
- "objectID": "people/visitors/thomas.html",
- "href": "people/visitors/thomas.html",
- "title": "Thomas Lumley",
+ "objectID": "people/index.html#current",
+ "href": "people/index.html#current",
+ "title": "Meet the team",
+ "section": "Current",
+ "text": "Current\n \n \n \n \n \n \n \n \n \n \n Alexander Ek\n \n \n Postdoctoral Researcher in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Catherine Forbes\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n Cynthia Huang\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Damjan Vukcevic\n \n \n Associate Professor and Director of Engagement\n \n \n \n \n \n \n \n \n \n \n \n \n \n David Frazier\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n David Wu\n \n \n Postdoctoral Research Fellow\n \n \n \n \n \n \n \n \n \n \n \n \n \n Di Cook\n \n \n Professor of Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Didier Nibbering\n \n \n Senior Lecturer in Econometrics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Gael Martin\n \n \n Professor of Econometrics\n \n \n \n \n \n \n \n \n \n \n \n \n \n George Athanasopoulos\n \n \n Professor of Statistics and Head of Department\n \n \n \n \n \n \n \n \n \n \n \n \n \n Harriet Mason\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Jack Jewson\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Janith Wanniarachchi\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Jarryd Chapman\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Jessica Leung\n \n \n Lecturer in Business Analytics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Kate Saunders\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Klaus Ackermann\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Michael Lydeamore\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Mitch O’Hara-Wild\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Nuwani Palihawadana\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n P. G. Jayani Lakshika\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Patrick Li\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Rob J Hyndman\n \n \n Professor of Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Ruben Loaiza Maya\n \n \n Senior Lecturer in Econometrics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Swen Kuh\n \n \n Postdoctoral Researcher\n \n \n \n \n \n \n \n \n \n \n \n \n \n Tina Rashid Jafari\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Xiaoqian Wang\n \n \n Postdoctoral Researcher in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Yangzhuoran Fin Yang\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n No matching items"
+ },
+ {
+ "objectID": "people/index.html#visitors",
+ "href": "people/index.html#visitors",
+ "title": "Meet the team",
+ "section": "Visitors",
+ "text": "Visitors\n \n \n \n \n \n \n \n \n \n \n Galit Shmueli\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n Paulo Canas Rodrigues\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n Thomas Lumley\n \n \n Professor\n \n \n \n \n \n \n \n No matching items"
+ },
+ {
+ "objectID": "people/index.html#alumni",
+ "href": "people/index.html#alumni",
+ "title": "Meet the team",
+ "section": "Alumni",
+ "text": "Alumni\n \n \n \n \n \n \n \n \n \n \n \n \n \n Title\n \n \n Description\n \n \n \n \n \n \n \n \n \n Anastasios Panagiotelis\n \n \n Associate Professor\n \n \n \n \n \n \n \n Cameron Roach\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Dan Simpson\n \n \n Professor of Analytics Engagement\n \n \n \n \n \n \n \n Earo Wang\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Emi Tanaka\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n Fan Cheng\n \n \n Data Scientist\n \n \n \n \n \n \n \n Lauren Kennedy\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Luis Torres\n \n \n Postdoctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Mahdi Abolghasemi\n \n \n Post Doctoral Researcher (Data Science)\n \n \n \n \n \n \n \n Nathaniel Tomasetti\n \n \n Data Scientist\n \n \n \n \n \n \n \n Nicholas Spyrison\n \n \n PhD (Information Technology)\n \n \n \n \n \n \n \n Nick Tierney\n \n \n Research Software Engineer\n \n \n \n \n \n \n \n Pablo Montero Manso\n \n \n Post Doctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Patricia Menendez\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n Priyanga Dilini Talagala\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Puwasala Gamakumara\n \n \n Post Doctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Ryan Thompson\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Sayani Gupta\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Sevvandi Kandanaarachchi\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Sherry Zhang\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Stephanie Kobakian\n \n \n Data Scientist\n \n \n \n \n \n \n \n Stuart Lee\n \n \n Postdoctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Thiyanga Talagala\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Ursula Laa\n \n \n Lecturer (Statistics)\n \n \n \n \n \n No matching items"
+ },
+ {
+ "objectID": "people/visitors/galit.html",
+ "href": "people/visitors/galit.html",
+ "title": "Galit Shmueli",
"section": "",
- "text": "t.lumley@auckland.ac.nz\n https://profiles.auckland.ac.nz/t-lumley\n \n Google scholar\n Interests:\n \n bioinformaticsmedical statisticsdesign of medical trialsstatistical computingsurvey statistics"
+ "text": "galit.shmueli@gmail.com\n https://www.galitshmueli.com\n \n Google scholar\n Interests:\n \n explain or predictstatistical strategybiosurveillanceonline auctionscount data modelsquality control"
},
{
- "objectID": "people/visitors/paulo.html",
- "href": "people/visitors/paulo.html",
- "title": "Paulo Canas Rodrigues",
+ "objectID": "about.html",
+ "href": "about.html",
+ "title": "Monash NUMBATs",
"section": "",
- "text": "paulocanas@gmail.com\n https://www.paulocanas.org\n \n Google scholar\n Interests:\n \n statistical learninghigh-dimensional dataforecastingtime seriesrobust statisticsdata visualization"
+ "text": "NUMBATs is the name for our research group, Non-Uniform Monash Business Analytics Team, in the Econometrics and Business Statistics Department. We meet regularly to openly exchange and discuss research ideas and activity in an actively supportive and creative environment. People outside the group are welcome to attend, and can sign up to receive notifications about topics for each meeting, or check the calendar on the web site.\nOur mission is to advance methodology for making data analysis easier. We develop new techniques for statistical modeling and visualisation, and work hard to make these available to the general public by distributing open source software, mostly using R.\nIt is also our goal to raise awareness of Australian fauna and flora, particularly many of the beautiful endangered species. For example, in reality, numbats are insectivores, mostly found in Western Australia, and we encourage you to contribute to the conservation efforts by donating or purchasing products from Project Numbat."
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html",
- "href": "img/FlexiblevsInflexible/flexinflex.html",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "objectID": "index.html",
+ "href": "index.html",
+ "title": "Non-Uniform Monash Business Analytics Team",
"section": "",
- "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n \n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, bit to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
+    "text": "Latest News\n\n\nWelcome to Professors Paulo Rodrigues and Thomas Lumley, visiting us until June 2024, and Galit Shmueli visiting May 6-17.\nWelcome to new PhD students Jarryd Chapman and Tina Rashid Jafari 😄 We’re excited to welcome you!\nMonash Master of Business Analytics: Strap on an explorer’s backpack and a skeptic’s hat and learn how to analyse data. Find out more.\nThe real numbats could use your help. If you would like to help with numbat conservation, take a look at Project Numbat. You can buy numbat souvenirs or make donations to help with numbat conservation. Go to Project Numbat."
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html#a-conspiracy-theory-is-like-a-bad-model",
- "href": "img/FlexiblevsInflexible/flexinflex.html#a-conspiracy-theory-is-like-a-bad-model",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "objectID": "events.html",
+ "href": "events.html",
+ "title": "Events",
"section": "",
- "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n \n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, bit to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
+ "text": "We hold regular seminars during the semester period (March-June, August-Nov). And there are numerous other events organised or related to NUMBAT.\nYou can add the events to your calendar via iCal or Google calendar."
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html#outrageous-claims-need-outrageous-evidence",
- "href": "img/FlexiblevsInflexible/flexinflex.html#outrageous-claims-need-outrageous-evidence",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "1: Outrageous Claims Need Outrageous Evidence",
- "text": "1: Outrageous Claims Need Outrageous Evidence\nMy mother is a “bit eccentric” to put it mildly. In the last few months, to only name a few things, she has bought a fire truck to start mud-crabbing (pictured below), bought some goats because the garden is a pain to manage, and turned the pool into a “fish Club Med” where she collects wildlife from the local creek and feeds them McDonalds for breakfast. From expulsions to arrest warrants, to the man she drank goon with at the beach who now lives in our house, the stories are endless. Despite this, never in my life had I ever been called a liar for telling them (the first time was at university orientation). People at my school had grown used to it, they had met my family and heard years worth of stories so I had a wealth of evidence to normalise my claims. Strangers didn’t have that, and so they didn’t believe my outrageous (completely true) claims. Similarly in statistics, if we want a complicated model we will need a large sample size to back it up.\n\nWhy Flexible Models Need a Bigger Sample\nIn general, the larger your sample size, the more likely it is you have captured the “true relationship”. If you are increasing the number of parameters to estimate (not literally for non-parametric models but the idea carries on) without increasing the sample size, we are in effect decreasing the “sample size” for each of the predictions, and thus decreasing the reliability of our model. Placing more weight on all the observations in calculating our estimates, means we are increasing the influence of outliers and unrepresentative samples. We can either have observations contributing to a large area but averaged over many observations, or over a small area where our estimates are averages over fewer observations. For example, If we have 10 observations and predict using the average, each observation contributes to 1/10th of the prediction, but all have the same. If we use 1-Nearest Neighbour, each prediction is only backed up by a single observation (illustrated below), however it is highly tailored to any relationships that may be specific to . Highly flexible models can be, and sometimes are, the appropriate choice to model a relationship, we just need a large sample to justify it. Outrageous claims need outrageous evidence."
+ "objectID": "posts/secret-santa-2023/index.html",
+ "href": "posts/secret-santa-2023/index.html",
+ "title": "Secret Santa 2023",
+ "section": "",
+ "text": "Overview\nWrapped up the year in style at the NUMBATs end-of-year bash on Nov 22nd, 2023, at Clayton Campus! Secret Santa gifts exchanged, and we kicked back with a few interesting games for a fun-filled time!\n\n\n\nParticipants\n\n\n\n\n\nIncredible present of socks featuring numbats designs made by Di Cook’s mother"
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
- "href": "img/FlexiblevsInflexible/flexinflex.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups",
- "text": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups\nThe introduction of the internet was the age of new information. Conspiracy theories were on their way out, now anyone can use their phone and find the facts in seconds. Or can they? What I unfortunately discovered when mum got involved with conspiracy theories, is that for every website with legitimate information, there are 50 that don’t. The sheer vastness of the internet means that whenever we expand our search for hidden truth, we are just as likely to discover falsities. This is a useful illustration in dimensionality.\n\nFlexible Models Are Hurt More By Additional Parameters\nDimensionality interacts with the flexible vs inflexible models in two ways. The first is that in some occasions adding dimensions can literally be seen as making the model more flexible. Think of adding a squared variable to a linear regression to make it quadratic, we have made the model more flexible by adding a dimension. The second way it interacts with our models, is by increasing the distance between observations, and thus increasing the domain of each variable. To get technical, each additional parameter makes the area each observation is responsible for increase exponentially. Just like how increasing flexibility increases the “weight” of observations by localising their impact on the model, dimensionality makes the total “area” bigger, and so it does a similar thing. Sometimes the relationship between our variables needs to be modeled with a highly flexible model, and so we need to keep this interaction between flexibility and dimension in mind so our model variance doesn’t get out of control."
+ "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html",
+ "href": "posts/reducing-teaching-duplication-with-unilur/index.html",
+ "title": "Reducing duplication in teaching materials",
+ "section": "",
+    "text": "When I was a young, impressionable undergraduate, a Computer Science lecturer once told me:\nWe’ve all done it. Whether it’s hard-coding a time period and then changing the dataset, or whether it’s hard-coding a population size in a model that’s quoted on national television, we all do our best to keep code clean to avoid these kinds of pitfalls.\nSo why, then, does it seem like so many of us have at least two copies of every tutorial and assignment? One with just the questions, and one with the solutions on it? I understand that both types of files are required, at least until generative AI makes us fully change our assessment, but the idea of having two identical questions in two different files makes me very nervous.\nIn fact, earlier this year while teaching a unit for the second time, one of my tutors pointed out that the solutions had a different set of numbers in the question compared to the ones students were answering. Unfortunately, this also materially changed the interpretation of the answer, and so I had to go through, re-issue solutions and re-mark a pile of assignments.\nAt EBS, a large portion of the content for our units is managed through RMarkdown, in a reasonably standardised format. As a sign of the times, every time I inherit a unit, I try to port material over to Quarto. It just feels like the right thing to do. Given the ability for both of these systems to output multiple formats on render, I started thinking about how to have one master question file that could output both the student question set and the full solutions at the same time."
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html#capitalism---the-gateway-conspiracy-to-lizard-people",
- "href": "img/FlexiblevsInflexible/flexinflex.html#capitalism---the-gateway-conspiracy-to-lizard-people",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "3: Capitalism - The Gateway Conspiracy to Lizard People",
- "text": "3: Capitalism - The Gateway Conspiracy to Lizard People\nNobody suddenly wakes up in the morning, looks in the mirror and says to themselves “Yes, today is the day. Today is the day I start believing in the lizard overlords.” I believe the process is more nuanced than that. Just like the “SayNoToPeerPressue” acting troupe who’s dreams I got to watch die in the comfort of my high school gym, I’m about to push the idea of gateways. From my personal experience, the process of becoming involved in conspiracies looks a little something like this: \nMy point is that ideas that hinge on something already well established in society are easier to swallow than those that aren’t. That is not to say entirely new theories must be wrong, but rather that they are harder for people to immediately understand and they are also more likely to be too out there for the general population to get on board with. I think of parametric and non-parametric models in a very similar way to how people think of capitalism vs lizard people conspiracy theories.\n\nNon-Parametric Models Are Usually More Flexible, But Not Always\nParametric models construct our function by assuming its type, and then estimating the best model within this range. Non-parametric models do not make any assumptions about our model’s form, but rather try to fit to the general shape of the data. Parametric and Non-parametric does not directly translate to flexibility; they both have the potential to produce a very flexible or inflexible fit. For example, a constant polynomial and a K-NN model where K=N would both predict the average response (the most inflexible model we can get). Rather, just like dimensionality, non-parametric models can fall into the same pitfalls as flexibility, and so the limits of our dataset should be kept in mind. By their nature, non-parametric models are more susceptible to variance from changes in the sample, as the sample is the only thing the model is using to make its predictions. Therefore, they are more likely to overfitting than parametric models and are usually more difficult to interpret. These features mean that in general non-parametric models are more flexible, simply by their nature, however they are still have the potential to be inflexible."
+ "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#a-clunky-first-attempt",
+ "href": "posts/reducing-teaching-duplication-with-unilur/index.html#a-clunky-first-attempt",
+ "title": "Reducing duplication in teaching materials",
+ "section": "A clunky first attempt",
+    "text": "A clunky first attempt\nI spent a bit too much time staring at CSS and ad-hoc web development, even though the results are sometimes nice. When all you have is a hammer, everything looks like a CSS-based nail. Enter the chunk option:\n```{r, echo=solutions}\nlibrary(readr)\ncovid_cases <- readr::read_csv(\"https://docs.health.vic.gov.au/covid19/data/NCOV_cases_by_postcode_LGA.csv\")\n```\nThis is pretty straightforward. I set the variable solutions at the top of the file to be TRUE if I want solutions to be printed, and FALSE otherwise. With a bit of finagling, you could also pass this into params in the YAML at the top of the document, and then two calls to rmarkdown::render() with the relevant parameters would probably get you what you want. With a bit of styling:\n```{css}\n.solution {\n padding: 10px;\n border: 1px solid black;\n margin-bottom: 5px;\n}\n\n.solution { background-color: #F9E79F; }\n\n.solution::before {\n font-weight: bold;\n font-style: italic;\n}\n\n.solution::before { content: \"Solution\";}\n```\nyou can even make it look nice. It was a good first attempt, and saw me through for that semester. But, I still felt like there had to be a cleaner way, and something I could generalise to Quarto (which loses me the luxury of the rmarkdown::render() call)."
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html#there-are-always-going-to-be-loonies-on-the-internet",
- "href": "img/FlexiblevsInflexible/flexinflex.html#there-are-always-going-to-be-loonies-on-the-internet",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "4: There are Always Going to Be Loonies on the Internet",
- "text": "4: There are Always Going to Be Loonies on the Internet\nWe can all spend our entire lives trying to convince everyone on the internet that they are wrong, but at the end of the day, we live in a complicated world, with complicated people, and there are always going to be loonies on the internet. Rather than dreaming of a world where everyone knows everything all the time, the system should just be to manage the chaos. The important life skill to learn isn’t that everyone needs to be corrected, and to focus on the nutters, but rather enjoy the fact that the majority get most things right, most of the time. Socrates might disagree with my idea on majority votes but you win some you lose some.\n\nYou Will Always Have Irreducible Error and It’s Size Matters\nObviously we can never have a perfect prediction since we are working with random variables. We can make our models more flexible to try and account for as much of the error as we can, but if we do, we might end up missing the underlying system entirely. No matter how flexible our model is, we will never have perfection thanks to our irreducible error (an attempt at making one is illustrated below). The interaction between flexibility and irreducible error comes from its size. A large irreducible error means the general shape change more drastically between samples, while a small one means our samples will remain consistent. Just like dimensionality, assumptions about our model, and sample size, this is just something that needs to be kept in mind as it has a strong interaction with the flexibility of our model, and the error from variance."
+ "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#inheriting-a-solution",
+ "href": "posts/reducing-teaching-duplication-with-unilur/index.html#inheriting-a-solution",
+ "title": "Reducing duplication in teaching materials",
+ "section": "Inheriting a solution",
+    "text": "Inheriting a solution\nFor the first time this year, I am running a subject on my own. As I can’t really do anything independently in my life, I set about borrowing all the relevant templates for the LMS and converting handbook entries. At some point, I should share the Moodle template - it really is something, even though it isn’t mine!\nI was very fortunate. I inherited a very clean set of lecture notes from Emi Tanaka, which had even been ported to Quarto already. Bar some minor changes, I’ll teach this course as I got it in its first year, if for no other reason than so I can learn it fully!\nWhile in my pre-semester prep, I noticed that the tutorials are still Rmd files. Nothing else in the course is, so why these, I wondered?\nIt turns out, in what shouldn’t be a surprise, someone else had already engineered this master-copy-to-2-outputs solution:\n---\ntitle: 'ETC5523: Communicating with Data'\nsubtitle: \"Tutorial 1\"\nauthor: \"Michael Lydeamore\"\ndate: \"Week 1\"\noutput: \n unilur::tutorial_html_solution: \n toc: true\n suffix: \"-solution\"\n unilur::tutorial_html: \n toc: true\n suffix: \"-question\"\n---\nThe unilur package was designed for exactly this. I must say, it doesn’t look particularly active on GitHub, but nonetheless, it worked perfectly. With an extra argument to rmarkdown::render(output_format = \"all\"), two documents are produced, with the relevant suffixes on them. Brilliant!\nThe styling I inherited was a little egregious (I really can’t do white text on a green background), but it did the job. And no more multiple copies of files for me to royally screw up when I’m in a hurry.\nThe only problem is, it’s an Rmarkdown output format, not a Quarto one. And literally everything else in this course is Quarto. There are even lectures dedicated to it. I stumbled upon a Quarto port, converting it to an extension, which was a great start. The system the author had gone with was to add a chunk option:\n```{r}\n#| unilur-solution: true\n```\ncombined with a YAML boolean show-solution: true, which you could change to generate the two files. I ported over the first tutorial reasonably quickly, and it performed almost as expected. I had nice expandable dropdowns for solutions:\n\nThe only problem was combining code chunks with text, which, in a course all about how to explain your story, was quite important. This is because the only way to get a solution block to generate is with the chunk option, and so text solutions have to be enclosed in a block type. Embedding code chunks into a block… uh… block does work, but you lose syntax highlighting, and the ability to execute them if you want the solutions to actually compile.\nThe clunky solution to this is to have two solution blocks, and write the answer in such a way that the code comes after the text. I guess this is somewhat traditional; it’s certainly how I explained algorithms in my PhD thesis, but it is difficult for markers, and a bit jarring for students to have to jump around the page. Not quite as bad as figures at the end of a manuscript, but it has similar vibes. I don’t like it.\n\nDiving into the extension\nLike all Quarto extensions, the meat of what’s going on is in the _extensions/ folder. There’s a YAML file that gives defaults for what is added by the extension (which here is just a contributes for a LUA filter), some CSS, and a LUA filter. I can’t really write LUA, certainly not good LUA. 
My only experience of it is a terrible mod I made many years ago for Factorio, which as an aside is a brilliant game everyone should play.\nBut, I have enough CS experience and training to be able to read these things and clunk my way through them. Thankfully, this LUA filter isn’t particularly long. The Div function just checks if we’re in the right sort of thing (so a “cell”, and has the attribute unilur-solution == TRUE), and if it’s all good, spits out the solution. Otherwise, instead of returning the content (which would be el in this code), we return a totally empty element.\nThis means that extending the functionality to include a class is pretty easy. Just check if the class of the div (which for reasons unknown to me is always capitalised in LUA) is also called unilur-solution1:\n\nif (el.classes:includes(\"cell\") and el.attributes[\"unilur-solution\"] == \"true\") or (el.classes:includes(\"unilur-solution\")) then\n... do stuff ...\n\nThis worked a treat. Now I can use a Pandoc fenced div to specify solutions with text and code, and the code is highlighted and executed exactly as it would be on a normal block. The system works like this:\n\n::: unilur-solution\n\nHere is a solution that is inside a div. The contents of this will only be shown\nin the solution document.\n\n:::\n\nNow the last step: getting multiple documents to output from one master file, from one quarto render call.\n\n\nMultiple formats at once\nQuarto supports multiple format documents out-of-the-box, and for the most part, they work pretty well, minus a quirk with RStudio only rendering one of the formats. Use the CLI for that by the way, it works much cleaner.\nThere’s even support for formats with the same file extension so they don’t overwrite each other. Just add output-file to your YAML and you can generate two HTML files.\nSo this was looking pretty easy. I’ll just convert the extension to an output format, include it twice in the header of my tutorial and off we go. In case you ever need it, here’s how you can specify your own output format:\ntitle: Unilur Questions\nquarto-required: \">=1.3.0\"\ncontributes:\n format:\n common:\n filters:\n - unilur.lua\n show-solution: false\n html:\nYou set your document type as the folder your extension is in, plus the format you want. So in this case, I have:\nformat:\n unilur-question-html:\n output-file: tutorial-01-question.html\n embed-resources: true\nI forgot the -html the first time and caused myself a lot of pain.\nUnfortunately you can’t have the same output format twice, just changing the YAML options. I don’t understand why, I won’t pretend to understand why, and in the kindest way, I don’t think I really want to understand why.\nSo, sadly, the solution was to have two almost identical extensions: unilur-question, which defaults show-solution: false, and unilur-solution, which defaults to the opposite. Still two files, I guess, but it ends up being not too bad."
},
{
- "objectID": "img/FlexiblevsInflexible/flexinflex.html#to-conclude",
- "href": "img/FlexiblevsInflexible/flexinflex.html#to-conclude",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "To Conclude",
- "text": "To Conclude\nDon’t let your mum hang out with weirdos, and treat conspiracy theories and overly complicated models with scepticism."
+ "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#the-workflow",
+ "href": "posts/reducing-teaching-duplication-with-unilur/index.html#the-workflow",
+ "title": "Reducing duplication in teaching materials",
+ "section": "The workflow",
+ "text": "The workflow\nSo now, it’s pretty straightforward. You can use the extension like any other Quarto extension2:\nquarto install extension MikeLydeamore/unilur\nwhich will install both identical extensions. Set your YAML as per above, fence off your solutions using either the chunk option or the div, and quarto render your way to what is in my opinion, a much cleaner workflow.\nThe YAML header becomes:\nformat:\n unilur-question-html:\n output-file: tutorial-01-question.html\n embed-resources: true\n unilur-solution-html:\n output-file: tutorial-01-solution.html\n embed-resources: true\nwhich I think is pretty clean. It’s not quite the suffix tag that’s in the original extension, but I think I can live with that. If anyone knows a way to get the current name of the file in the YAML (or possibly elsewhere in extension-land), I’d love to hear it so I don’t need output-file anymore.\nIf you use this, or have other solutions, I’d love to hear about them. Please let me know however you see fit: Twitter, GitHub, or e-mail."
},
{
- "objectID": "projects.html",
- "href": "projects.html",
- "title": "Projects",
+ "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#footnotes",
+ "href": "posts/reducing-teaching-duplication-with-unilur/index.html#footnotes",
+ "title": "Reducing duplication in teaching materials",
+ "section": "Footnotes",
+ "text": "Footnotes\n\n\nQuarto (and Rmarkdown for that matter) doesn’t have a LUA highlighter so you’ll have to read this one yourself.↩︎\nPending an open pull request↩︎"
+ },
+ {
+ "objectID": "posts/election_hexmaps/index.html",
+ "href": "posts/election_hexmaps/index.html",
+ "title": "Hexmaps with sugarbag make it easier to see the electoral map",
"section": "",
- "text": "WOMBAT 2022\n \n\n \n Workshop Organised by the Monash Business Analytics Team 2022 Communicating with Data\n \n\n \n \n \n \n \n \n \n \n \n \n\n \n\n \n \n\n \n \n \n \n \n \n \n \n \n tidyverts\n \n\n \n R packages for tidy time series analysis\n \n\n \n \n \n \n \n \n \n \n \n \n\n \n\n \n \n\n \n \n \n \n \n \n \n \n \n Quarto\n \n\n \n Quarto resources designed for use at Monash University\n \n\n \n \n \n \n \n \n \n \n \n \n\n \n\n \n \n\n\n\n\nNo matching items"
+    "text": "Australia is a land of wide open spaces where the population concentrates in small areas. It can make for misleading map visualisations on statistics related to people. The May 20, 2022 ABC article The Australian election map has been lying to you explains this very neatly. It has also provided a better alternative to examine election results, in the form of a hexmap of Australia. The hexmap provided in the article is almost certainly manually constructed, which is fine for a construct-once, use-many-times purpose.\nWhen you want to be able to make a hexmap on new spatial data or if the spatial groups change, the R package sugarbag can be helpful. This post explains how to do this, using the results as we have them today from yesterday’s election. (We’ll update these once the final results are released.)\nHere’s how to get started. Download the current spatial boundaries for electorates from the Australian Electoral Commission web site.\nLoad the libraries we need:\n\nlibrary(ggthemes)\nlibrary(sf)\nlibrary(sugarbag)\nlibrary(tidyverse)\nlibrary(plotly)\n\nRead in the spatial polygons, defining the boundaries. These files can be very large, and slow to draw. For these visualisations, faster drawing is more important, so the boundaries can be simplified using rmapshaper::ms_simplify.\n\n# Spatial polygons\nelectorates <- sf::st_read(\"2021-Cwlth_electoral_boundaries_ESRI/2021_ELB_region.shp\")\nelectorates_small <- electorates %>% rmapshaper::ms_simplify(keep = 0.01, keep_shapes = TRUE)\n\nNext we need the election results. The ones here are manually constructed from the ABC results website. These results are joined to the map polygons, and colours are manually set to the one typically used by each party. The ggplotly() function enables labels to pop up on mouseover.\n\n# Read in data on current electoral results\nnew <- read_csv(\"electoral_2022.csv\") %>%\n select(Electorate:Party)\nnew_major <- new %>%\n mutate(Party_maj = fct_collapse(Party,\n LNP = c(\"LIB\", \"LNP\", \"NAT\")))\nelectorates_small <- electorates_small %>%\n left_join(new_major, by=c(\"Elect_div\"=\"Electorate\"))\nmap <- ggplot() +\n geom_sf(data=electorates_small,\n aes(fill = Party_maj,\n label=Elect_div),\n colour=\"white\") +\n scale_fill_manual(\"\", values=c(\"ALP\"=\"#E13940\",\n \"LNP\"=\"#1C4F9C\",\n \"GRN\"=\"#009C3D\",\n \"KAP\"=\"#906E3E\",\n \"CA\"=\"#FFC000\",\n \"IND\"=\"#66b2b2\",\n \"UNDEC\"=\"#808080\")) +\n theme_map()\nmap\n\n\n\n\n\n\n\n#ggplotly(map)\n\nAn interactive version can be found here.\nThe map is blue – it looks like the coalition won the election in a landslide, doesn’t it! (Please note the strange shape of the Cape of York is from the AEC spatial polygons provided! 
It is not due to the polygon thinning.)\nTo convert this into a hexmap automatically with sugarbag, we need to:\n\nFind the centroids of each polygon.\nCreate a hexagon grid with a desired size of hexagon (hs controls this).\nAllocate electorates to a spot on the grid.\nTurn the hexagon centroids into hexagons.\nJoin with election results.\nMake it interactive using ggplotly().\n\n\n# Find centroids of polygons\nsf_use_s2(FALSE)\ncentroids <- electorates %>%\n create_centroids(., \"Elect_div\")\n\n## Create hexagon grid\nhs <- 0.8\ngrid <- create_grid(centroids = centroids,\n hex_size = hs,\n buffer_dist = 5)\n\n## Allocate polygon centroids to hexagon grid points\nelectorate_hexmap <- allocate(\n centroids = centroids,\n hex_grid = grid,\n sf_id = \"Elect_div\",\n ## same column used in create_centroids\n hex_size = hs,\n ## same size used in create_grid\n hex_filter = 10,\n focal_points = capital_cities,\n width = 35,\n verbose = FALSE\n)\n\n# Make the hexagons\ne_hex <- fortify_hexagon(data = electorate_hexmap,\n sf_id = \"Elect_div\",\n hex_size = hs)\nelectorate_hexmap_new <- e_hex %>%\n left_join(new_major, by=c(\"Elect_div\"=\"Electorate\"))\nhexmap <- ggplot() +\n geom_sf(data=electorates_small,\n fill=\"grey90\", colour=\"white\") +\n geom_polygon(data=electorate_hexmap_new,\n aes(x=long, y=lat,\n group = hex_id,\n fill=Party_maj,\n label=Elect_div)) +\n scale_fill_manual(\"\", values=c(\"ALP\"=\"#E13940\",\n \"LNP\"=\"#1C4F9C\",\n \"GRN\"=\"#009C3D\",\n \"KAP\"=\"#906E3E\",\n \"CA\"=\"#FFC000\",\n \"IND\"=\"#66b2b2\",\n \"UNDEC\"=\"#808080\")) +\n theme_map()\nhexmap\n\n\n\n\n\n\n\n#ggplotly(hexmap)\n\nAn interactive version can be found here.\nAnd that’s it! The sugarbag hexmap will expand the densely populated small areas outwards, while maintaining proximity to neighbouring electorates and to the city centre. It is a type of cartogram algorithm with two important differences: (1) it uses equal area for each hexagon instead of area proportional to population, and (2) it allows some hexagons to be separated so that the geographic positions are reasonably preserved.\nThe hexmap makes it easier to see the results distributed across the country, and the predominance of red makes it clear that Labor won.\nData for this post can be found here.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
{
- "objectID": "posts/regularisation/index.html",
- "href": "posts/regularisation/index.html",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "objectID": "posts/hackathon_2024/index.html",
+ "href": "posts/hackathon_2024/index.html",
+ "title": "Hackathon 2024",
"section": "",
- "text": "Back when I lived with my sister I barely managed to drink once a month, and health wise, I was living a good life. Unfortunately, my sister decided to get engaged and her fiance was all “its weird that we live with your hermit sister” and “you two have been co-dependent for years its unhealthy and it’s time to stop”. When I moved in with my friends at the beginning of the year I was immediately tossed in to a long strict lock down. I had to deal with living in a house of people I liked, all of which had no schedule and were all bored out of their minds, so, to cut a long story short, we all started drinking a lot. I have since significantly cut back (I have gone back to my original “barely once a month” amount), but during our trial period for high functioning alcoholism, our friend Fynn introduced us to a “guess your uni grades” drinking game. Here is how it works:\n\nGuess your mark for all the units you are taking this semester\nWhen you get your results, calculate the difference between your guess and your actual result. e.g. if you guess 51 for Topology and actually get a 73 because the exam was scaled, you get 22 points.\nTake your points for each unit and sum them up. e.g. If topology got you 22 points, data structures 7 points, and a project unit was 3 points, your total is 32 points.\nIf you did not do a 4 unit work load, you scale your points up and round to the nearest integer to match that. e.g. if you had 32 points and did 3 units, your scaled score is 43.\nThe number of points you have is the number of drinks you have to do.\n\nFynn’s version is proper shots. Unfortunately because the example case was based on my housemate Tom, who apparently has next to no idea how he is doing in his courses, we had to change our rules to shots of mixed drinks. Even with this change we calculated that there was still a chance Tom would be killed by alcohol poisoning. After a 3-1 house vote we agreed we were willing to make that sacrifice for the sanctity of the game. My housemates in order of least to most points were:\n\nZac with 4 drinks\nMe with 13 drinks\nEm with 17 drinks\nTom with 43 drinks\n\n\n\n\nA visualisation of both the guessing and social order of the house. Here, Tom has died of alchohol poisoning.\n\n\nThis game led into a discussion about whose grades are the most difficult to predict. For example, there are things that seem to make guessing easier, such as completely in semester units. While things like my and Tom’s history of high variance puts us at a natural disadvantage. The first step to understanding what affects our changes in grades is to predict them. Figure 1 below gives a basic visualization of the house’s grades. The x axis represents the year and semester, however it is mostly arbitrary.\n\n\n\nThe house’s grades for every semester of uni.\n\n\nLooking at this plot we can visually see some of our personality quirks in the data. This plot makes it rather obvious which semester I had a mental breakdown and was diagnosed with ADHD. Em’s consistently high marks show the benefit of grade based anxiety, and the slight drop at the end shows the trade off that occurs when you start looking after your mental health. Zac’s grades all sit around the same range, because despite being a reliably high achiever, he over commits to extra-curricula and often hands assignments in several days late, which essentially stops him from getting higher than an 80. 
Tom has no reason for being the way he is.\nWe want to try and improve our predictions by building a model that forecasts next semesters results. Fynn’s house had a total of 69 drinks, while we had 77 and losing to another household (especially one as inferior as Fynn’s) is a blight on our competitive record. The problem with building a model is that there is very little data here, especially when compared to the number of variables at hand. This means even something as simple as a linear regression will have too much flexibility and will likely over fit, so to fix this, we need to turn to regularisation."
+    "text": "Overview\nThe fourth NUMBAT hackathon was held May 28-30, 2024, in Daylesford, Vic. A hackathon is styled like rOpenSci events where attendees brainstorm potential projects, and join together in small groups to tackle a subset of these.\nProjects\nProjects tackled can be found on GitHub.\n\n\n\nParticipants\n\n\n\n\n\nWorking on the projects\n\n\n\n\n\nWorking on the projects\n\n\n\n\n\nWorking on the projects\n\n\n\n\n\nand great food"
},
{
- "objectID": "posts/regularisation/index.html#the-drinking-game-that-killed-tom",
- "href": "posts/regularisation/index.html#the-drinking-game-that-killed-tom",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "objectID": "posts/ConvolutionalNeuralNetwork/index.html",
+ "href": "posts/ConvolutionalNeuralNetwork/index.html",
+ "title": "Mario Party: Destroyer of Friendships and Explainer of Convolutional Neural Networks",
"section": "",
- "text": "Back when I lived with my sister I barely managed to drink once a month, and health wise, I was living a good life. Unfortunately, my sister decided to get engaged and her fiance was all “its weird that we live with your hermit sister” and “you two have been co-dependent for years its unhealthy and it’s time to stop”. When I moved in with my friends at the beginning of the year I was immediately tossed in to a long strict lock down. I had to deal with living in a house of people I liked, all of which had no schedule and were all bored out of their minds, so, to cut a long story short, we all started drinking a lot. I have since significantly cut back (I have gone back to my original “barely once a month” amount), but during our trial period for high functioning alcoholism, our friend Fynn introduced us to a “guess your uni grades” drinking game. Here is how it works:\n\nGuess your mark for all the units you are taking this semester\nWhen you get your results, calculate the difference between your guess and your actual result. e.g. if you guess 51 for Topology and actually get a 73 because the exam was scaled, you get 22 points.\nTake your points for each unit and sum them up. e.g. If topology got you 22 points, data structures 7 points, and a project unit was 3 points, your total is 32 points.\nIf you did not do a 4 unit work load, you scale your points up and round to the nearest integer to match that. e.g. if you had 32 points and did 3 units, your scaled score is 43.\nThe number of points you have is the number of drinks you have to do.\n\nFynn’s version is proper shots. Unfortunately because the example case was based on my housemate Tom, who apparently has next to no idea how he is doing in his courses, we had to change our rules to shots of mixed drinks. Even with this change we calculated that there was still a chance Tom would be killed by alcohol poisoning. After a 3-1 house vote we agreed we were willing to make that sacrifice for the sanctity of the game. My housemates in order of least to most points were:\n\nZac with 4 drinks\nMe with 13 drinks\nEm with 17 drinks\nTom with 43 drinks\n\n\n\n\nA visualisation of both the guessing and social order of the house. Here, Tom has died of alchohol poisoning.\n\n\nThis game led into a discussion about whose grades are the most difficult to predict. For example, there are things that seem to make guessing easier, such as completely in semester units. While things like my and Tom’s history of high variance puts us at a natural disadvantage. The first step to understanding what affects our changes in grades is to predict them. Figure 1 below gives a basic visualization of the house’s grades. The x axis represents the year and semester, however it is mostly arbitrary.\n\n\n\nThe house’s grades for every semester of uni.\n\n\nLooking at this plot we can visually see some of our personality quirks in the data. This plot makes it rather obvious which semester I had a mental breakdown and was diagnosed with ADHD. Em’s consistently high marks show the benefit of grade based anxiety, and the slight drop at the end shows the trade off that occurs when you start looking after your mental health. Zac’s grades all sit around the same range, because despite being a reliably high achiever, he over commits to extra-curricula and often hands assignments in several days late, which essentially stops him from getting higher than an 80. 
Tom has no reason for being the way he is.\nWe want to try and improve our predictions by building a model that forecasts next semesters results. Fynn’s house had a total of 69 drinks, while we had 77 and losing to another household (especially one as inferior as Fynn’s) is a blight on our competitive record. The problem with building a model is that there is very little data here, especially when compared to the number of variables at hand. This means even something as simple as a linear regression will have too much flexibility and will likely over fit, so to fix this, we need to turn to regularisation."
+ "text": "This is The Blog of a Mario Party Master\nIn pre-COVID times, my friends and I would gather around for a fun activity called “lets ruin our friendship by taking Mario party way too seriously”. The night always starts with laughter and few drinks, and ends with me standing on a chair, pointing at my closest friends, and screaming “I’m going to make you cry tears you thought were reserved for the death of your mother”. Once the moment has passed it seems a little dramatic, but at the time, we all truly believe that the speed you can get virtual candy out of a jar is an appropriate measure of human worth.\n\n\n\n\nThe last thing my friends see before I block them\n\n\n\nThere are several games that cause spite, but one called “Absent Minded”, pictured below, always sends us into an argument. Basically, you have 3 characters, and a slowly appearing image, and you have to find out which character is absent as the pictures become clearer. The faster you correctly identify, the more points you receive. I have never lost the game. Additionally there are 3 levels of this mini game, and so 3 different ways the images are shown to you: Jumbled Pictures, Blurred Pictures, and One At a Time.\n Example: the “One At a Time” level \nNow, obviously the levels are meant for humans to play, and not for teaching machine learning, but the challenge each level presents gives us an interesting way to view the concepts. The jumbled picture level can show us how our typical machine learning algorithm will view an image. The blurred picture level shows the benefit of using convolutional neural networks, and the one at a time level can go in the trash! Sorry, not every analogy will fit perfectly into a machine learning theory.\n \n\n\nHow Does The Picture Look to a Computer\nBefore I jump into explaining the concepts, I want to explain how your computer will “see” your image. Statistical models do not have eyes, and so for any picture we want to use, an observation needs to be converted in to a dataset. The process is illustrated below (although the variable size would be each pixel, and not limited by how small I can make my handwriting).\n \nFirst our image is broken up into its individual pixels. For greyscale they are typically given a single number to represent its “darkness”, and colour images are given three different values for red, green, and blue (RGB). This dataset is what will be used to represent your model (although I will use the image rather than the dataset for visualisations).\n\n\nPart One: The Jumbled Picture Level\n\nTheory: What’s Wrong With What We Already Have\nTechnically, we could use any other machine learning algorithm to classify an image. We can call these “bag of pixel” methods as they don’t consider the pixels location in the image. They essentially cut the image up into pixels, shake them up in a bag, toss them out, and classify based off these values. Ultimately, the problem with any “bag of pixel” model, is that it fails to capture the shape of the image independent of its location. This means only images that have the right features in the right area are correctly classified.\n\n\nAnalogy: What Makes the Jumbled Level Hard\n \nThe jumbled picture stage is interesting, because we cannot look at the characters as a whole to identify who is present. Since We cannot identify the pictures using the overall shape of the character, we need to look for the presence of independent key features. 
This reliance on identifiable features in the correct location is also what identifies our ordinary algorithms.\n\n\nPutting Them Together\nIn reality, this jumbling in our pictures would be at a pixel level, not big chunks, but the minigame is more of a point of illustration rather than a technical tool to understand machine learning. Key features being in the right place can be used successfully to identify images, but ultimately we have “zoomed in” too far to see the relationship between pixels. We can conceptualise this further with an example.\nIf we are trying to identify Mario, cap-looking pixels where Mario’s cap should be make it easy. If we have a picture where Mario doesn’t have his cap, that makes it hard. If we have a single picture where Mario is laying down so his cap is where his feet should be, that makes it even worse.\nThis is essentially the problem with our regular Machine learning algorithms. Key features in the right place make classification easy. No key features makes classification hard. Key features in uncommon places will be incorrectly assumed to be something else, and cause misclassification. This is where the benefits of using a convolutional neural network come in.\n\n\n\nPart 2: The Blurry Image Level\n\n\nTheory: How does a Convolutional Neural Network Work?\nBefore we return to explaining machine learning concepts in terms of Mario Party, lets take a step back, and look at how convolutional neural networks work in general. The illustration below is an overview of the structure of a CNN. The information contained in the image undergoes several transformations using layers that can be classified as either “feature extraction”, or “classification”.\n \n\nFeature Extraction Layers\nFeature extraction is what really sets the CNN apart from other models. These layers make new variables that are more “computer vision friendly”. The process creates nodes that identify certain aspects of an image, such as Yoshi’s Shell or Peach’s hair, and converts them to variables we can use to make predictions. The most common (and technically interesting) layers used in the process are explained below. The “options” are specifically related to building CNN in the R package, Keras.\n\nConvolutional Layers\n \nThe convolutional layer is what makes a neural network, a convolutional neural network. This layer creates a small window (called a kernel), that travels along the image and looks for a specific feature. The kernel_size option selects the size of the grid that is run along the image. Larger grids can overlook important details, while smaller grids can provide too much information and create noise. Typically the standard is somewhere in the range of a (3x3) grid. This information is taken from image to feature map using a filter. The filter is basically the type of feature we are looking for, when we run the kernel over the image. The number of times we do this, each with a unique filter, is the “depth” of the layer. In Keras, that is given by the filter option. As for which filters it uses, that is trained by the model.\nThe only convolutional layer that takes information from the image is the first one. All the following feature maps are computed on the previous feature maps. 
The new filter is applied to some combination of the previous layer’s feature maps, so more convolutional layers mean variables that represent more intricate features.\n\n\nMax Pooling\n Max pooling is a step in our convolutional neural network that is essentially a dimension reduction of our feature maps. Literally just think of it as doing no transformation to the image other than shrinking it down. As with all dimension reductions, the goal here is to get rid of the pixels that contain noise (e.g. white space) and keep the pixels that identify the image (e.g. Mario’s cap). This layer reduces our chance of overfitting, and thus is a key player in the bias-variance trade-off in convolutional neural networks.\n\n\nHow does it work?\nJust like the original image, feature maps can be seen as a grid of pixels. Max pooling sections each feature map into smaller non-overlapping grids, takes the largest value of those pixels, and moves it on to the next layer. The example illustration above is looking for the darkest pixel in each 2x2 grid. Grid size is important: we want to keep the grid small to minimise the bias introduced into the model, but not so small that the layer does nothing to eliminate noise.\n\n\nWhy the Maximum?\nMax pooling is a rather counter-intuitive layer, statistically speaking. In practice, the maximum seems to be the measure that minimises information loss, rather than a measure of central tendency as we would expect. As to why, the best analogy I’ve seen for the max pooling stage is from the Data Skeptic podcast. If you are looking for your keys and everyone in the group except one person says they don’t have them, you aren’t going to take the median or average answer. We are not looking for how much the picture looks like Mario’s cap on average; we are looking for any sign of Mario’s cap.\n\n\n\nClassification Layers\n\nDense\nA dense layer takes the nodes from the previous convolutional layers and makes a fully connected layer. This essentially takes our feature maps as inputs and runs them through a typical neural network, which we won’t go into detail about here. Our final classification layer is also a dense layer, one that outputs the probabilities of each classification option. This is the final output of the model.\n\n\n\nDropout Layers\nUnlike the previous layers, dropout layers can be placed among either the feature extraction or the classification layers. In fully connected neural networks its use is quite simple; it samples random nodes to remove from the layer, which prevents overfitting. This interpretation does not follow for dropout layers placed among the feature extraction layers (the reason is matrix multiplication, but it’s not worth going into here); however, it still helps prevent overfitting. Sometimes the number of pooling layers is limited by the resolution of the image (we can only max pool it so far), so if we need an additional measure against overfitting, we can include dropout layers.\n\n\n\nAnalogy: The Blurry Image Level\n \nCircling back to Mario Party, the blurry level’s difficulties are different from those of the jumbled level. Here, we struggle to make out key features, and must instead use the overall shape and colour to identify the character. As the image becomes clearer, it becomes easier to see, and we are more certain of our answer; however, this additional certainty does not come without cost. The longer you wait to select an answer in the minigame, the more likely it is that you lose. 
This ultimately means that if the differences between characters are too subtle, the amount of time it will take to identify the difference isn’t worth the cost, and we are better off guessing.\n\n\nPutting It Together\nWaiting for certainty in the minigame is similar to overfitting in our convolutional neural networks. The deeper we go, the more likely it is that we overfit, and the more computationally heavy it is. We can add in dropout layers, but eventually there is an upper limit on the certainty we can have in our prediction. Unlike the normal models, however, CNNs can capture both the shape AND the key features; they just need to be deep enough.\n\n\nMario Party and Convolutional Neural Networks: A Neat Analogy or Desperate Stretch to Make Statistics Seem Fun?\nObviously CNNs have more nuance to them than can be explained using a Mario Party minigame, but the analogy doesn’t do a bad job of giving us a conceptual understanding. Normal algorithms are limited by their inability to identify characters independent of their location, an issue we can circumvent using CNNs, which capture both the shape and the general features of a character. Although really, the most important lesson from this post should be that if you come to my house to play Mario Party you might end up dead in a ditch somewhere.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
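A minimal sketch of the kind of CNN the post above describes, using the R keras package it mentions. The input shape (28x28 greyscale images), layer sizes, and number of classes are illustrative assumptions, not values taken from the original post.

```r
library(keras)

# A small CNN mirroring the layer types discussed above (assumed sizes, for illustration only)
model <- keras_model_sequential() %>%
  # Feature extraction: 3x3 kernels scan the image for local features
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu",
                input_shape = c(28, 28, 1)) %>%
  # Max pooling keeps the strongest signal in each non-overlapping 2x2 block
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  # Dropout as an extra guard against overfitting once we cannot pool any further
  layer_dropout(rate = 0.25) %>%
  # Classification: flatten the feature maps and pass them through dense layers
  layer_flatten() %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")   # assumed 10 output classes

summary(model)
```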
{
- "objectID": "posts/regularisation/index.html#what-is-regularisation",
- "href": "posts/regularisation/index.html#what-is-regularisation",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
- "section": "What is regularisation?",
- "text": "What is regularisation?\nRegularisation is essentially a systematic way to reduce the variance and increase the bias of a model and improve the overall error through the bias variance trade off. There are quite a few regularisation methods, but I’m not going to go through all of them here. Rather I have summarised three of the more common techniques below.\n\nSubset selection: This technique selects a subset of the predictors to use in the regression. There are three common types of subset selection: forward subset selection, backward subset selection, and best subset selection. Forward subset selection starts with the null model and, at each step, adds the variable that reduces the test error the most, until the model is at a point where the addition of new variables don’t improve the the test error. Backward subset selection does the same but in reverse, it starts with the model containing all the variables and removes predictors until the test error does not improve. Best subset selection makes every possible model (the power set of the predictors) and chooses the one with the minimum error, however this can also over fit and is often computationally infeasible.\nPCA regression: You may remember principal component analysis (PCA) from one of my previous posts as an unsupervised learning method, but it can also be used as a regularisation technique. Using the principal components (which are just linear transformations of the original predictors) as predictors in an OLS regression can reduce the variance of the model.\nShrinkage methods: these methods make a new error function that is the sum of the residual sum of squares (RSS) and a penalty term and selects coefficients by minimising this new error function. The two main methods are lasso, which minimises the function RSS + \\lambda\\sum_{j=1}^{p}|\\beta_j|, and ridge which minimises the function RSS + \\lambda\\sum_{j=1}^{p}\\beta_j^2, where \\lambda is a tuning parameter. These additional penalties force the coefficient estimates to be closer to 0.\n\nThe method used in the example, and the main focus of the rest of this post, will be the shrinkage methods, as they have the most interesting theory and haven’t been explained previously on the blog. Now that we have seen how we perform regularisation, this still leaves the question why it works. There are two main benefits to regularisation, lower error and better model interpretability. I will explain how each of them work below.\nThe first reason to use regularisation is to reduce the variance of our model. Often, we do this implicitly by choosing a simple model due to a lack of data. For example, if we had built a neural network and found that the model had too much variance, we could instead build a random forest as a less flexible alternative. Regularisation is used when our model is already as simple as it can be, e.g. a linear regression or LDA in the case of classification, and it still has too much variance. We can’t get more data, and to remove a level of flexibility from a linear regression would be to predict the average (a constant). Regularization allows us to reduce this error from variance by further restricting the model parameters and thus allowing a model that is even more inflexible than a normal least squares linear regression.\nThe second reason to use regularisation is to improve the interpretability of the model. A large number of unnecessary variables not only introduces error, but also complicate the model. 
The benefit of using regularisation to improve model interpretability stems from the idea that there are signal and noise variables and we want to keep the signal variables while removing the noise variables. Regularisation removes predictors that have a spurious relationship to the response variable and leave us with less coefficients to interpret."
+ "objectID": "posts/internships/index.html",
+ "href": "posts/internships/index.html",
+ "title": "Can our Masters students help you?",
+ "section": "",
+ "text": "Do you have a data analysis task and want some help with it? We have a lot of Masters students who might be able to assist."
},
{
- "objectID": "posts/regularisation/index.html#example-do-the-grades-need-a-regularised-model",
- "href": "posts/regularisation/index.html#example-do-the-grades-need-a-regularised-model",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
- "section": "Example: Do the grades need a regularised model?",
- "text": "Example: Do the grades need a regularised model?\nTechnically we don’t need a reason to run a regularised model, it is just another method we can use to balance the bias and variance trade off, but in cases where there is a very small amount of data it is more useful to do than not. In this example we want to predict the grade of each unit in the up coming semester using several factors, such as student, department, whether the unit was in semester 1 or 2 (I suspect we do worse in semester 2 due to burn out), the level of the unit (most of us should do better in first year units), whether the unit was online, etc. There are also several interaction terms that could be useful, for example an interaction term between the Harriet student dummy and the online semesters would capture the later jump in my grades. There are obviously more interesting (and useful) variables we could include, such as whether we needed to move house during semester, if we went on a holidays during midsemester break, if we were seeing a therapist, etc. These variables would likely produce a better prediction and more easily interpreted coefficients, however I’m going to keep the model simple and leave it as is. Once we have our data set we can immediately see two reasons to use a regularised model over a normal OLS.\nFirst of all, the matrix is full rank, that is, we have variables that are a linear combination of other variables in the data set. For example, Tom and I are the only two students who take maths units (MTH), so with the student other department variables, the MTH variable becomes obsolete. There are several other variables with this issue. I’m not sure which variables are better to keep (department or student) and this issue will likely get worse as I add interaction terms.\nSecond of all, with such a small data set, any model with more than a handful of predictors will have a large amount of variance. Figure 2, below, shows the test and training error of a simple linear model that’s flexibility has been increased (somewhat arbitrarily) with the addition of new predictors. In this plot, a 0 in flexibility indicates a model that predicts the mean, while an 8 indicates a model that contains all the predictors in the data set as well as every every interaction term. This plot only shows the change in mean squared error (MSE) over a single sample of the data. To see the MSE of the model over several samples (and properly assess the variance of each model) we should do some resampling.\n\n\n\nThe trainning and test error compared with model complexity.\n\n\nFigure 3 shows the density of the test and training error of 50 samples of:\n\na basic linear model which predicts the mean of the training set for all test observations\na simple linear model which is an OLS model with only a handful of predictors I selected\na complex linear model which is an OLS model with every predictor and their interaction terms.\n\nThis gives us a cross validated version of Figure 2, and confirms what the previous plot indicated. First of all, it shows, a basic model has slightly too much bias because the training and test error are, on average, higher than the error of the simple model. It also shows that the complex model has over fit the data, given its consistently low training error and high unpredictable test error. We need a model that is somewhere between the complex “every variable” model and a constant. 
To find this model, we will use regularisation, specifically a shrinkage method.\n\n\n\nDensity plots of the training and test error of three linear models that differ in flexibility."
+ "objectID": "posts/internships/index.html#masters-of-business-analytics-students",
+ "href": "posts/internships/index.html#masters-of-business-analytics-students",
+ "title": "Can our Masters students help you?",
+ "section": "Masters of Business Analytics students",
+ "text": "Masters of Business Analytics students\nOur Masters of Business Analytics students are well-trained in the entire workflow of data analysis including data collection, munging, exploratory analysis, modelling and reporting. Our program is based around R, so all students should have relatively advanced R skills. Some of them may also have Python, SQL and other language skills. Our students are also taught to use modern reproducible practices and are comfortable with using git, Rmarkdown, etc.\nTowards the end of their degree, our students take on an internship in which they work with an organization on a data analysis project. The project is for a minimum of 225 hours (30 working days) to be undertaken at a time suitable to the organization and the student. This does not have to be done during normal teaching periods. Normally, the majority of the hours are to be spent on site (or virtually) embedded in the organization. It is not a requirement that students are reimbursed for this work, although some organizations choose to pay students a nominal wage during their internship. (Monash will provide insurance for all students provided they are not classified as employees.)\nSuitable projects should involve a substantial data analysis or modelling component, and should address a problem of interest to the sponsoring organization. At the start of the program, the organization, Monash University and the student must all agree on a suitable project.\nStudents will write a report of about 30 pages outlining the analysis they have undertaken. A copy of this report will be provided to the sponsoring organization, along with all the code that was produced as part of the analysis.\nEach student will need a supervisor from the sponsoring organization who must meet with the student at least weekly to ensure the student is on track. All participating students will also meet regularly (every few weeks) with a Monash University academic who can help with any technical issues.\nAt the conclusion of the project, the supervisor will need to provide a one page report to Monash University on the student’s performance.\nIf you think this might be of interest to your organization, please contact Professor Rob Hyndman."
},
{
- "objectID": "posts/regularisation/index.html#shrinkage-methods",
- "href": "posts/regularisation/index.html#shrinkage-methods",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
- "section": "Shrinkage Methods",
- "text": "Shrinkage Methods\nThe most common regularisation methods are ridge and lasso regressions. Lasso and Ridge follow the same general idea, which is to put additional restrictions on the coefficients of a linear regression, they only slightly differ on how they go about it. Lasso, will minimise RSS + \\lambda\\sum_{j=1}^{p}|\\beta_j|, and ridge will minimiser RSS + \\lambda\\sum_{j=1}^{p}\\beta_j^2. The turning parameter \\lambda decides how much the penalty term influences the final coefficient values. A large value of \\lambda means the penalty term outweighs the RSS and coefficients are estimated at 0, a small value of \\lambda means the penalty will not be factored in at all and the model will return the OLS coefficient estimates. Figure 4 shows a contour plot of the lasso penalty, RSS function, and lasso error term for a two variable model. The animation shows the contours of the lasso regression look more like the contour plot of the penalty term as \\lambda increases. In turn we can see the minimum value of the error function (and thus the estimated coefficients) moves from the OLS estimates (the minimum of the RSS) to 0 (the minimum of the penalty).\n \nWhat may not be clear from this animation, but does simplify our ability to visualise how this adjusted error works, is that for every value of \\lambda there is some value of s such that we are minimising the RSS subject to the constraint \\sum_{j=1}^{p}|\\beta_j| \\leq s in the case of lasso and \\sum_{j=1}^{p}\\beta_j^2 \\leq s in the case of ridge. This means that instead of trying to think about a complicated constantly changing error function, we picture our restraints as shown in the illustration below. Here I have drawn a contour plot of a hypothetical RSS for a two variable model. The plot on the left has the ridge regression constraints drawn over it, while the plot on the right has the lasso constraint. The size of the circle/diamond is related to the tuning parameter \\lambda. When \\lambda=0 the area of the circle/diamond is infinite, and when \\lambda \\rightarrow \\infty the circle/diamond becomes so constrained it forces every coefficient to 0. This allows us to see how the constraint impacts the selected coefficient estimates.\n\n\n\nAn illustration of the difference between the ridge and laso regression constraints\n\n\nSomething that is important to note is that lasso regression is more likely to set coefficients to 0 (and thus more likely perform feature selection) than ridge due to the diamond shape of the constraint. The minimum RSS value in figure 4 showed this in practice, as the minimum coefficient estimate quickly set \\beta_1 to 0 before further restricting \\beta_2. Most commonly we will visualise the way the coefficients change as \\lambda increases with a plot of the coefficient vs \\lambda values, as drawn below.\n\n\n\nAn illustration of how the coefficients change as lambda increases.\n\n\nThere is one final question we need to answer before we move on. How do we decide whether to use ridge or lasso regression? Well, if you think all the variables are relevant, use ridge regression, if you suspect some variables to just be noise, use lasso. Now, with an understanding of how shrinkage methods work, we can go back to our example."
+ "objectID": "posts/permutation_variable_importance/index.html",
+ "href": "posts/permutation_variable_importance/index.html",
+ "title": "Using the Bachelor to Understand Permutation Variable Importance",
+ "section": "",
+ "text": "The season of The Bachelor is upon us, and what better way to celebrate my love of drawn-out reality TV than to use it to explain permutation variable importance in the random forest model. For those who are not familiar, The Bachelor is a dating show where each week female contestants are eliminated when they do not receive a rose during the rose ceremony. The winner is famously difficult to predict, and many complicated factors (screen time, number of dates, etc.) mean our variables are ever evolving through the season and difficult to use in analysis. Today we will not be predicting the winner of The Bachelor (as fun as that sounds); rather, we will use The Bachelor as the basis of an example in calculating variable importance.\n\nWhat Matters Most When Choosing A Partner\nAnyone who has viewed the show for many years starts to notice a trend in the girls who always make it to the end of the competition. In the image below I have circled the top six participants from last year’s season.\n\n\n\nNotice anything? The girls at the end of The Bachelor are overwhelmingly blonde. Of course, regular viewers would notice other things too, like how every season has a group skydiving date that ends with one of the girls crying, overcoming her fear, and getting extra time with the bachelor (when I type this out the show sounds stupid). However, we are going to focus on the hair, specifically how we can find out how important hair colour is in separating the winners from the losers.\n\n\nIntroducing Our Bachelorettes\nFor our illustration, let’s make an example competition that consists of 10 people, broken down into their most arbitrary characteristics: name, hair colour, eye colour, and job.\n\n\n\nObviously the real winner isn’t chosen on these characteristics alone, but this is a fake example and my fake bachelor is a shallow guy. First we give all the girls a final position in the fake competition, and assign them to one of three groups: finalists (top 3), placed moderately (middle 4), and losers (bottom 3).\n\n\n\n\n\nA Normal Random Forest Model\nBefore we can even look at variable permutation, we need a random forest model. If you need a refresher on how they work, a random forest model takes B bootstrapped samples and builds a tree for each. Usually, just by chance, about a third of the contestants will not be used to build each tree; these are the out-of-bag contestants.\n\n\n\nTypically, for more complicated data sets, random forest models use a random subset of all the predictors at each node. However, since we only have 3 predictors, we will ignore that for this example (it won’t have any major influence on our results). This model will have multiple trees, but for simplicity, we are only going to look at the first tree in depth, which is illustrated below.\n\n\n\nContestants 2, 5, 7, and 9 are our out-of-bag contestants and so were not used to build the tree. Running these four contestants through the tree we get our out-of-bag (OOB) error.\n\n\n\nNow at this point we have a bootstrapped sample, a tree, and an OOB error for each of the B trees in our forest (but we have only looked at the first in depth). This is the basis of a typical random forest model, and it is also what we will use as a point of comparison when we permute our variables.\n\n\nPermutation Variable Importance\nTo calculate the importance of a variable (in this case hair), we randomly permute that variable among the observations. 
This creates a new dataset where all the variables are the same EXCEPT for the one variable we are checking. So for The Bachelor example, the girls have all the same characteristics as before except their hair colour is randomised.\n\n\n\nRationally, we can tell that if our Bachie isn’t using hair colour as a key decider for his life partner (as we would hope), randomising that variable would have no effect on the girls’ positions in the competition. People getting divorced over dyed hair is no way for a society to function. Again, we calculate our OOB error using the tree above and contestants 2, 5, 7, and 9. However, we now take our predictors from the table with the permuted hair variable.\n\n\n\nThis gives us an OOB error for the version of The Bachelor where love is colour blind. The difference between the first OOB error and the OOB error for the permuted observations gives us the importance of hair colour in the first tree. We repeat this calculation for all trees in the forest and take the average to find the overall variable importance. That, in a nutshell, is how we calculate permutation variable importance.\n\n\nFinal Comments Before We Leave the Mansion\nIt is easy to see the logic behind this method of calculating variable importance. If we are essentially rolling a die to decide a variable, it shouldn’t be useful in making predictions. If that variable was previously important, we have caused serious damage to the predictive power of our model. While this isn’t a complete computation of variable importance (since we only calculated it for one tree and one variable), its purpose is to take a look under the hood of the process, and, hopefully, into the heart of our bachelor.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
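For readers who prefer code to rose ceremonies, here is a rough sketch of the same idea using the randomForest package. The contestant data frame is made up for illustration; `importance(fit, type = 1)` reports the package's built-in permutation (OOB) importance, and the manual shuffle below is only a crude in-sample version of the calculation described in the post, not the exact OOB computation.

```r
library(randomForest)

set.seed(1)
# Hypothetical stand-in for the bachelorette data described above
contestants <- data.frame(
  hair    = factor(sample(c("blonde", "brunette", "red"), 100, replace = TRUE)),
  eyes    = factor(sample(c("blue", "brown", "green"), 100, replace = TRUE)),
  job     = factor(sample(c("nurse", "model", "teacher", "lawyer"), 100, replace = TRUE)),
  outcome = factor(sample(c("finalist", "middle", "loser"), 100, replace = TRUE))
)

# importance = TRUE asks randomForest to compute permutation importance from the OOB samples
fit <- randomForest(outcome ~ hair + eyes + job, data = contestants, importance = TRUE)
importance(fit, type = 1)   # mean decrease in accuracy when each variable is permuted

# The same idea by hand (in-sample, so only illustrative): shuffle hair and compare accuracy
shuffled <- contestants
shuffled$hair <- sample(shuffled$hair)
mean(predict(fit, contestants) == contestants$outcome) -
  mean(predict(fit, shuffled) == shuffled$outcome)
```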
{
- "objectID": "posts/regularisation/index.html#predicting-the-grade",
- "href": "posts/regularisation/index.html#predicting-the-grade",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
- "section": "Predicting the Grade",
- "text": "Predicting the Grade\nLets apply this theory to our grades model to see if we can improve our predictions. Some of the variables are linear combinations of others, so there is absolutely no need to keep all the predictors. This means we should opt for lasso over ridge regression, although this does have one downfall. This example has a large number of interaction terms, and when we include interaction terms, we typically need to maintain a hierarchy so our variables are interpretable, e.g. we need to keep the Harriet and the Online dummy variables in the model if we want to include the Harriet:Online interaction term. Ridge and lasso regression do not follow this hierarchy when they shrink variables. Usually this would make predictability worse, however since every single predictor in this data set is a dummy variable, it isn’t going to cause (too) much of an issue. The main problem will be having almost no idea what the base line model is. From this point forward we will mostly focus on the improvements in test error, and continue with the lasso regression.\nTo find our lasso model, we need a \\lambda value. The best way to find this value is with cross validation, and thankfully the glmnet package does this for us. Figure 5, below, shows the mean test MSE and 95% confidence interval of the lasso regression for several values of \\lambda. The vertical dotted line indicates the \\lambda value that minimises the model error.\n\n\n\nSelecting our lambda value with the glmnet package’s cross validation method.\n\n\nWe can also visualise how our coefficients change as \\lambda increases. Figure 6 shows the change in the model coefficients as we allow \\lambda to approach 0 (or our L1 Norm to get infinitely large as shown on the x axis). The dashed line indicates the model associated with the \\lambda value found from cross validation. This allows us to better understand how some coefficients interact with each other. For example the Harriet:online interaction is the largest coefficient in every model, regardless of the \\lambda value, which indicates it is a consistently useful variable.\n\n\n\nThis plot shows the impact on our variables of a decreasing lambda (and thus increasing L1 norm).\n\n\nThe model that contains every variables as well as every student, unit level, department and online interaction term has 54 variables, the regularised model has only 20 variables, so there has been some serious culling. Figure 7 shows the predictors that made it into the final model. Since the baseline model (the one that we compare each dummy variable to) is now a mess, these coefficients are almost impossible to interpret.\n\n\n\nThe lasso model coefficients.\n\n\nFinally, we can compare the lasso model to the basic, simple, and complex models from figure 3. Figure 8 compares the cross validated RMSE of the three old models and the new lasso model. We can see that the simple model (that was just the student and online variables as well as all their interaction terms) may slightly outperform the lasso model, however there is so much overlap in the confidence intervals it is hard to say. In this example, the lasso model did not select variables that were better than my general intuition. Lasso can help you regularise to some degree, but even regularisation techniques can be given too many predictors to choose from, and it seems my intuition was enough to beat it this time.\n\n\n\nThe RSME of the final lasso model when compared to the previous models over several resamples."
+ "objectID": "posts/bias_variance_flexibility/index.html",
+ "href": "posts/bias_variance_flexibility/index.html",
+ "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
+ "section": "",
+ "text": "When we are building a machine learning model, we have a choice between a simple (inflexible) model and a complicated (very flexible) model. We need to decide how flexible the model should be to work well for future samples. An inflexible model may not reflect a complex underlying process adequately and hence would be biased. A flexible model has the capacity to capture a complex underlying process, but the fitted version might change enormously from one sample to another, which is called variance. This difference is illustrated in the figure below.\nWhen we think of how the bias and variance change with flexibility, we typically only look at their behaviour on average. In the plot below, the left side corresponds to an inflexible model and the right side corresponds to a flexible model. We can see that the test error stays slightly above the training error as flexibility increases, until the test error shoots up. Visualisations like this are shown frequently in the textbook “An Introduction to Statistical Learning with Applications in R” by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani, which largely inspired this blog post. While this explains the behaviour of our test error on average, it doesn’t give a complete understanding of how our test error estimate will act within any individual sample. This is where we find the benefit of understanding the error distribution. The distribution of the test error allows us to not only understand the average behaviour, but also how that behaviour may change from sample to sample."
},
{
- "objectID": "posts/regularisation/index.html#conclusion",
- "href": "posts/regularisation/index.html#conclusion",
- "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
- "section": "Conclusion",
- "text": "Conclusion\nRegularisation can be used to reduce the variance and improve the interpretability of our model, but human intuition can still outperform it if we know enough about our data. That being said the models for our grade predictions turned out to be useless. Results for this semester have been released and Tom was 32 off, while the simple model was 55 off. Really, the lesson here is that no model, no matter how fine tuned, can predict a response variable that has an inexplicably high variance."
+ "objectID": "posts/bias_variance_flexibility/index.html#flexibilitys-influence-on-test-error",
+ "href": "posts/bias_variance_flexibility/index.html#flexibilitys-influence-on-test-error",
+ "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
+ "section": "Flexibility’s Influence on Test Error",
+ "text": "Flexibility’s Influence on Test Error\nWhen changing the flexibility of a model, the test error distribution will go through three phases that affect both its expected value and its variance.\n\nPhase 1: Decreasing Bias in Model\nWhen our model is biased, we are forcing our data into constraints that don’t reflect the true relationship between the variables. Since we have not captured the true relationship, any sample drawn from our population will also have a more complicated relationship than that of our model, and so will have error from bias. This is illustrated below, where our high error is largely the result of too much bias in the model. Both distributions are similar to each other, but far from zero.\n\n\n\n\n\nPhase 2: Optimal Fit\nIncreasing the flexibility will reduce the bias, which will decrease the error. At the optimal fit, both the training and test errors will be smaller, and they will be pretty similar to each other. If you have captured the true relationship of the data with your model (if there is one), the distributions should perfectly overlap. This is unlikely to happen, since your model will always have a bias towards any quirks in your training set, and thus perform better on that set most of the time. So we will instead interpret the optimal fit as the point where the test error reaches its minimum (before the variance causes the total error to start to increase).\n\n\n\n\n\nPhase 3: Increasing Variance in Model\nAs we start to overfit our model, we introduce more error from variance than we are losing from decreasing bias. This has two effects on the distribution of the estimated test error. First, it causes the distribution to shift upwards as we have once again missed the true relationship in the population. This miss is different from bias, however, as we have overfit our model to the specifics of the training sample, so new samples drawn from the same population will not have a similar error. This causes the distributions to shift away from each other. Additionally, the variance of the test error estimate will also increase. Overfitting means a higher penalty for samples that just happen to be different from our training set, and a higher reward for those that just happen to have similar quirks. Ultimately that makes the estimates more unreliable, and thus gives them a higher variance.\n\n\n\n\n\nUnderstanding with an Example\nThis influence from flexibility can best be seen with an example. To illustrate it, we will use the Auto data from the ISLR package and fit a model to predict mpg using a polynomial of horsepower. If we take a look at the scatterplot of the two variables below, we can see that a linear model might not be flexible enough, but anything more flexible than a polynomial of degree about 4 will very likely overfit to the training sample. The plot below shows the data with a loess fit.\n\n\n\n\n\n\n\n\n\nWe can see the effect on the distributions using the animated density plot below. Here we have taken 100 different samples and fit models that predict mpg using polynomials of horsepower with degrees 1 to 15. Here we can see the hand-drawn illustrations and interpretations above play out. Initially, increasing the flexibility of our model eliminates bias and causes both distributions to shift down. At polynomial degree 4, they stop at the minimum, and then for polynomial degrees higher than that, variance is introduced, and the test error increases in both expected value and variance."
},
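As a rough sketch of the polynomial example described above (assuming the ISLR Auto data and a single validation split; the seeds and exact splits used in the original post are not reproduced here), the training and test MSE for each degree could be computed like this:

```r
library(ISLR)   # provides the Auto data used in the post

set.seed(42)                                     # assumed seed, for illustration only
train_id <- sample(nrow(Auto), nrow(Auto) / 2)   # a single validation-set split
train <- Auto[train_id, ]
test  <- Auto[-train_id, ]

mse <- function(fit, data) mean((data$mpg - predict(fit, newdata = data))^2)

errors <- data.frame(degree = 1:15)
errors$train_mse <- sapply(errors$degree, function(d)
  mse(lm(mpg ~ poly(horsepower, d), data = train), train))
errors$test_mse <- sapply(errors$degree, function(d)
  mse(lm(mpg ~ poly(horsepower, d), data = train), test))

errors   # the test MSE typically bottoms out around degree 3-4 and becomes erratic after that
```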
{
- "objectID": "posts/MLE/index.html",
- "href": "posts/MLE/index.html",
- "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
- "section": "",
- "text": "As a child, your parents are seen as deities that can do no wrong, that is until you are doing a first aid course 10 years later and learn that a broken arm is not an “I’ll take you to the hospital tomorrow if it still hurts” level emergency. Growing up I started to realise my Dad’s life lessons were somewhat unorthodox and below are some of my favourite quotes.\n\n“If you are going to light fires with your brother, make sure you do it by the pool. The ground is paved and if you set something on fire I’d rather it be you than the house”\n“I’m not going to any of your soccer games until you turn 12. I watched your brother play when he was younger. It was very boring and the other kids parents were insufferable”\n“If someone wants you to do something, they will probably pay you for it. So make sure you get paid. Unless I ask, then you need to do it for free.”\n\nThe last quote was probably the worst thing he taught us, at least for his financial security. It meant my siblings and I learnt to squeeze as much money out of my parents as we could. They paid me to cut my hair, get to school on time, go to swimming lessons, nothing was ever done for free. I even haggled my baby teeth with my mum for $50. This idea expanded to school yard, where my peers were much poorer than my parents, but also easier to part from their money. In grade 2 I had a period of selling stickers outside the tuckshop for spare change. The profit system was simple, sell stickers to my peers for a 10000% mark up. This plan was eventually shut down by “the man” i.e. the staff because some parents had complained about their kids not even getting lunch and just buying stickers. My most effective and long lasting money making plan however, was birthday parties. By the end of middle school, I was pulling in about $2000 a party. One of the most important elements in making a birthday profitable was the ratio of kids invited to kids that turn up. At the time I did guesstimation on this ratio, but now we are going to look at it in a more formal manner, using Maximum Likelihood Estimation (MLE)."
+ "objectID": "posts/bias_variance_flexibility/index.html#sample-to-sample-changes",
+ "href": "posts/bias_variance_flexibility/index.html#sample-to-sample-changes",
+ "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
+ "section": "Sample to Sample Changes",
+ "text": "Sample to Sample Changes\nHere it is important to highlight the difference between a population and a sample, so we can better understand how an unfortunate test and training split can hurt error estimates. A population is all the data on the thing you are trying to make an inference about. For example, if I want to make an inference on the true relationship between mpg and horsepower, the Auto data is a sample of that. Generally we would be interested in making statements about mpg and horsepower for all possible cars, where all possible cars would be our population. If I want to make an inference on the relationship between mpg and horsepower in the Auto dataset only (which is a weirdly specific realm to keep your inferences to, but each to his own I guess), then this data is the population itself. For our sample to be representative, it needs to be both randomly drawn and large enough. Unfortunately, even when we draw our samples to be decently large and random, we will still occasionally get some unrepresentative splits. A sample that is unlike the population will bring the validity of any inference we try to make using that sample (including predictive models) into question. Below is an illustration of how the sample can influence the fit, among other things.\n\n\n\n\n\n\n\nThat being said, it’s highly unlikely you would get a difference that dramatic in an actual sample. In reality, differences in your sample that are minor, almost invisible to the eye, will create large differences in your MSE estimates.\n\nAn Example of Sample Influence on Error\nThe scatterplots below show two of the training and test sample splits that were used in the phases example. One produced the best test error on the polynomial 15 model (MSE = 105) and the other, the worst (MSE = 9837). Is there a remarkable difference?"
},
{
- "objectID": "posts/MLE/index.html#step-1-identify-joint-density-function",
- "href": "posts/MLE/index.html#step-1-identify-joint-density-function",
- "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
- "section": "Step 1: Identify Joint Density Function",
- "text": "Step 1: Identify Joint Density Function\nI mentioned in the beginning that the function that can easily cross between the worlds of the outcomes and parameters is the joint density function, but what is it? Basically it takes a bunch of random variables (in our case, students) and says what is the chance the entire group has a specific outcome (in this case, attend or not attend). \nMoving from each students probability to the joint probability is not simple task. We can either create a model that understands the intricate inter-personal relationships of this second grade class, or we can make 3 assumptions that will greatly simplify the problem. Statisticians generally prefer the method that doest require reading children’s diaries and so we are now going to perform these assumptions on my second grade class.\n \n\nAssumption 1: Independence\nThe first assumption we are going to make about our students is that they are independent, i.e any particular kid going to my party is not at all influenced by any other student. In doing so we now have a dataset consisting of children who are completely immune to peer pressure, both a school administration and statisticians fantasy. Unfortunately we have also lost some realism along the way. Why do this? Well, right now the only thing we know is each student has a some probability mass function (which we will get to in assumption 3) but no information on how the students interact with each other. What if one student coming means other wont come? or a students coming means another will certainly come? In order to find the probability of this very specific outcome we have ended up with we need information about the variables relationship. Here we can either figure out the complicated interpersonal relationships of the children, or assume they are all independent. With this assumption, the joint PMF is the product of each individual PMF (this is literally the definition of independence). Now our students don’t interact, and we have taken our first step in simplifying our problem.\n \n\n\nAssumption 2: Identical\nNow our joint PMF is the product of 24 unique PMFs. The problem is, I don’t really care about the individual students (they are all dollar signs to me). I only care about the overall proportion of students. Here we can simplify our problem further by assuming there is some overall class PMF, and every student is just a random variable drawn from that. To use this assumption in our joint density function, we just say the probability of every student coming is the same. Now we have 24 observations drawn from a single distribution, which means we only need a single individual PMF to define the PMF of all the students.\n \n\n\nAssumption 3: Identify The Distribution For Individual Parameters\nAs a final step, we still need some individual PMF to put in the big product we have created. Since every student either comes or doesn’t come, we can easily say the PMF for each student follows a Bernoulli distribution. Ultimately this step just depends on what outcome you want to measure, and since I only really care about a yes/no outcome, a Bernoulli will do just fine. Now we have a joint PMF to work with."
+ "objectID": "posts/bias_variance_flexibility/index.html#how-our-estimation-method-influences-our-test-error-distribution",
+ "href": "posts/bias_variance_flexibility/index.html#how-our-estimation-method-influences-our-test-error-distribution",
+ "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
+ "section": "How Our Estimation Method Influences Our Test Error Distribution",
+ "text": "How Our Estimation Method Influences Our Test Error Distribution\nA glaring issue with our test error estimate is its high variance, which means less certainty in the conclusions we draw from our test estimates. If we want a test error estimation method that is less susceptible to this issue of variance, we could try using a cross-validation method. All methods, like the test error shown above, will still follow the general phases caused by increasing flexibility, but some have a lower overall variance (at the cost of more bias).\n\nThe Phases Example Using Cross Validation\nWhen we originally looked at the test error, it was estimated with the validation set approach (test in the plot) for simplicity. Now, let’s redo those error distribution estimates from the mpg and horsepower models, but also look at the distributions from the 10-fold (k10cv) and 5-fold (k5cv) cross-validation methods.\n\n\n\n\n\n\n\nHere we can see the bias-variance trade-off play out with our estimates of test error, just as it does with our model fit. Our cross-validation methods in order of increasing variance are:\n\n5-fold CV < 10-fold CV < Validation Set Method\n\nThe methods in order of increasing bias are:\n\n10-fold CV < 5-fold CV < Validation Set Method\n\nIn general, the k-fold CV bias and variance depend on the value of k, where LOOCV (k=n) is approximately unbiased."
},
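A sketch of how the k-fold estimates compared above might be computed with `cv.glm()` from the boot package; the fold counts match the post, but the seed and the repetition over many samples are assumptions made here for illustration.

```r
library(ISLR)   # Auto data
library(boot)   # cv.glm() for k-fold cross-validation

set.seed(42)    # assumed seed
cv_errors <- t(sapply(1:15, function(d) {
  fit <- glm(mpg ~ poly(horsepower, d), data = Auto)   # gaussian glm is equivalent to lm
  c(degree = d,
    k5cv   = cv.glm(Auto, fit, K = 5)$delta[1],        # 5-fold CV estimate of test MSE
    k10cv  = cv.glm(Auto, fit, K = 10)$delta[1])       # 10-fold CV estimate of test MSE
}))
cv_errors
```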
{
- "objectID": "posts/MLE/index.html#step-2-make-your-likelihood-function",
- "href": "posts/MLE/index.html#step-2-make-your-likelihood-function",
- "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
- "section": "Step 2: Make Your Likelihood Function",
- "text": "Step 2: Make Your Likelihood Function\nWow what a beautiful joint PDF… What do we do with it? Well I said in the beginning that the function that gives probability of outcomes and the function that gives probability of parameters is the same function just with a different unknown. Here are the two directions we could take with our joint PMF. \nSince in this case our unknown is the parameter, we are going to use the likelihood function. Here we can actually put find the Likelihood function for our particular birthday party results.\n\nL(\\theta)=\\theta^{18}(1-\\theta)^{6}\n\nBut to simplify it here with the outcomes wouldn’t be an accurate representation of how we usually have to conduct MLE. So I’m going to leave in the product notation. Now that we have a function that shows how likely different values of \\theta (the probability a student turns up) are, we need to find its maximum."
+ "objectID": "posts/bias_variance_flexibility/index.html#to-summarise",
+ "href": "posts/bias_variance_flexibility/index.html#to-summarise",
+ "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
+ "section": "To Summarise…",
+ "text": "To Summarise…\nAs the flexibility of our model increases, we know that the estimated model will have a decrease in bias and an increase in variance. This change in our model causes a change in both the mean and the variance of our estimated test error. A lot of the difference is caused by the increasing impact of our random sample split; however, it is not something that is visually noticeable. Like the model, the method of test error estimation has its own bias-variance trade-off, and it can be balanced using cross-validation methods.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
{
- "objectID": "posts/MLE/index.html#step-3-math-time-logs-and-derivatives-and-more-derivatives",
- "href": "posts/MLE/index.html#step-3-math-time-logs-and-derivatives-and-more-derivatives",
- "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
- "section": "Step 3: Math Time : Logs and Derivatives and More Derivatives",
- "text": "Step 3: Math Time : Logs and Derivatives and More Derivatives\nI mostly wanted to focus on the difference between a PMF/PDF and a likelihood function in this post, but for the sake of completeness I’m going to finish the estimation. That being said I’m not going to be very detailed. Our next step in the process is to take the log.\n\nWhy take the log?\nThe answer is really just mathematicians are lazy. From high school you may remember that when you want to find the maximum of a function you take the derivative and set it equal to 0. The thing is, we have a massive product right now, and the product rule is a pain to do. Especially when we have 24 functions multiplied together. Thanks to log laws, taking the log of this nightmare function both doesn’t change the value the maximum is at (thanks to some math stuff I won’t go into) and also means we have to take the derivative of a big sum instead of a big product, which is really easy.\nL(\\theta)=\\prod^{24}_{i=1}\\theta^{x_i}(1-\\theta)^{1-x_i}\nI’m going to do some cosmetic changes before applying the log.\nL(\\theta)= \\theta^{\\sum_{i=1}^{24} x_i}(1-\\theta)^{24-\\sum_{i=1}^{24}x_i}\nThen we have our log-likelihood.\nlogL(\\theta)= log(\\theta)\\sum_{i=1}^{24}x_i+ log(1-\\theta)(24-\\sum_{i=1}^{24} x_i)\n\n\nThe first derivative\nNow we take the first derivative. When our likelihood function has a rate of change of 0, it’s about to fall back down. So we take the derivative with respect to the value we want to maximise and find the parameter that is the most likely given our set of outcomes.\nlogL'(\\theta) = \\frac1\\theta {\\sum_{i=1}^{24} x_i}- \\frac1{1-\\theta}(24-\\sum_{i=1}^{24} x_i)\nSince the first order condition is that we would like the first derivative to be equal to 0, this is where I put the hat in because this isn’t true in general, only for our estimate.\n\\frac1{\\hat{\\theta}} {\\sum_{i=1}^{24} x_i}- \\frac1{1-\\hat{\\theta}}(24-\\sum_{i=1}^{24} x_i)=0\nWhich we solve to find\n{\\hat{\\theta}} = \\frac1{24}\\sum_{i=1}^{24} x_i\nNow that we have the solutions we can substitute in our values from our sample of party go-ers and get the probability any one person will turn up.\n{\\hat{\\theta}} = 0.75\n\n\nThe second derivative\nThe lazy of us ignore this step, although it is technically still important. I also tend to ignore it, and will do so here for the sake of brevity. Whoops. We already have our estimate, this is more about making sure we have a clean solution. Taking the second derivative ensures our estimate is a maximum, and not some other stationary point."
+ "objectID": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html",
+ "href": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html",
+ "title": "Do you need some analytics help? Maybe our internship program is for you!",
+ "section": "",
+ "text": "We have a new Masters in Business Analytics in the Econometrics and Business Statistics department in the Monash Business School, and in the final semester of the program the students have the option to do an internship. The aim is to give our students some work experience in analytics and to build links with various companies, institutions, and charities.\nThe program is highly selective and has an annual intake of around 50-60 students. The majority of our students will undertake the internship component of the program.\nThe students will have advanced R and data science skills. As well as having experience with various types of statistical analysis and machine learning, they have strong training in modern techniques for building reproducible analysis pipelines, automated reporting, and building interactive tools for understanding data.\nThe students will also meet regularly throughout the project with an academic from Monash and their fellow students to discuss problem solving and analysis techniques that are relevant to their projects. Previous students have worked on a diverse selection of projects including"
},
{
- "objectID": "posts/MLE/index.html#conclusion",
- "href": "posts/MLE/index.html#conclusion",
- "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
- "section": "Conclusion",
- "text": "Conclusion\nI used all the money I made from birthday parties to buy about $10 000 worth of manga books because I was a huge Weeb. Sadly I ended up donating them all to the school that expelled me in year 11. Turns out being mercenary enough to make buckets of money as a child doesn’t matter if you waste it all on books you are forced to give away when you move to Melbourne because student accommodation doesn’t come with a wall of free library space. I’m sure there is a lesson in here somewhere.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
+ "objectID": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html#footnotes",
+ "href": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html#footnotes",
+ "title": "Do you need some analytics help? Maybe our internship program is for you!",
+ "section": "Footnotes",
+ "text": "Footnotes\n\n\nMonash will provide insurance for all students provided they are not classified as employees.↩︎\nThis may be anonymised, deidentified, or redacted as required, as long as it is still possible to assess the student’s work.↩︎\nWe are happy to discuss other options for assessing the code if using github or sharing the code is not feasible for your company. This should be decided before the internship begins.↩︎"
},
{
- "objectID": "posts/2022-10-18-cran-rev-dep/index.html",
- "href": "posts/2022-10-18-cran-rev-dep/index.html",
- "title": "Diving into dependen-“sea”",
+ "objectID": "posts/2022-05-28-ggplot-sf/index.html",
+ "href": "posts/2022-05-28-ggplot-sf/index.html",
+ "title": "How long do maps on ggplot facets take?",
"section": "",
- "text": "When writing a package, we may want to use functions in other packages. This creates a dependency for our package and a reverse dependency on the package we borrow functions from. As one of the recipients of the isoband email1, I’m curious to know how interconnected CRAN packages are. Luckily, it is not too hard to get data on this, and so the journey begins…"
+ "text": "If you’re a ggplot user, making faceted plots must be a tool in your belt. If you happen to do some spatial analysis, you would be familiar with maps. Today, I will show you my surprising findings about the rendering time to make faceted maps.\nThis example comes from Chapter 7 of Paula Moraga’s book Geospatial Health Data: Modeling and Visualization with R-INLA and Shiny and I have simplified it for this demonstration. In essence, there are two datasets:\nSimple feature collection with 88 features and 1 field\nGeometry type: POLYGON\nDimension: XY\nBounding box: xmin: -84.8203 ymin: 38.40342 xmax: -80.5182 ymax: 42.32713\nGeodetic CRS: NAD83\n# A tibble: 88 × 2\n NAME geometry\n <chr> <POLYGON [°]>\n 1 Auglaize ((-84.13476 40.65755, -84.13467 40.65755, -84.13405 40.65753, -84…\n 2 Crawford ((-82.77258 40.99589, -82.77258 40.99588, -82.77168 40.99588, -82…\n 3 Montgomery ((-84.06231 39.8366, -84.06301 39.83665, -84.06501 39.83677, -84.…\n 4 Guernsey ((-81.22986 40.06315, -81.22987 40.06308, -81.22992 40.06119, -81…\n 5 Clark ((-83.83875 39.8233, -83.83889 39.82335, -83.83904 39.82339, -83.…\n 6 Gallia ((-82.18737 38.72608, -82.18727 38.72558, -82.18707 38.72488, -82…\n 7 Fairfield ((-82.82307 39.80773, -82.82307 39.8078, -82.82305 39.80801, -82.…\n 8 Darke ((-84.43157 40.15801, -84.43148 40.15487, -84.43148 40.1542, -84.…\n 9 Monroe ((-81.22569 39.57838, -81.24065 39.57883, -81.2413 39.57885, -81.…\n10 Portage ((-81.3184 40.98861, -81.31892 40.98862, -81.31927 40.98862, -81.…\n# … with 78 more rows\n# A tibble: 1,848 × 3\n county year SIR\n <chr> <dbl> <dbl>\n 1 Adams 1968 0.725\n 2 Adams 1969 0.588\n 3 Adams 1970 1.03 \n 4 Adams 1971 0.654\n 5 Adams 1972 1.05 \n 6 Adams 1973 0.693\n 7 Adams 1974 1.15 \n 8 Adams 1975 1.17 \n 9 Adams 1976 0.936\n10 Adams 1977 0.644\n# … with 1,838 more rows\nThe details of calculating SIR are not the focus of this post; Sections 7.1 to 7.2 of Paula’s book detail all the steps. Here I attach the script to generate these two data sets in case you would like to give it a spin:\nWhat we would like to do here is to show the SIR values of each county on the map across years. This would require us to join the two datasets, supply the combined data to ggplot, plot the underlying map, fill the county polygons with SIR, facet by year, and lastly tweak the theme and the fill scale. Let’s give this plot a name, say target:\ncombined <- ohio %>%\n left_join(sir, by = c(\"NAME\" = \"county\"))\n\ntarget <- combined %>%\n ggplot() +\n geom_sf(aes(fill = SIR)) +\n facet_wrap(~year, dir = \"h\", ncol = 7) +\n ggtitle(\"SIR\") +\n theme_bw() +\n theme(\n axis.text.x = element_blank(),\n axis.text.y = element_blank(),\n axis.ticks = element_blank()\n ) +\n scale_fill_gradient2(\n midpoint = 1, low = \"blue\", mid = \"white\", high = \"red\"\n )\n\ntarget\nEasy peasy.\nBut have you thought about how long it takes to render this plot?\nLet me show you some components of this plot as benchmarks on the timing. Here I have:\nOkay, now it is your turn to make a guess:"
},
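A minimal sketch of how the timing question above could be checked, assuming the target object built in the post; ggplot() itself is lazy, so the drawing work only happens when the plot is printed to a graphics device, and that print() call is what we time. The device path and size are arbitrary choices.
# Sketch: timing the rendering of the faceted map, assuming `target`
# exists as defined in the post above.
library(ggplot2)

png(tempfile(fileext = ".png"), width = 1600, height = 1200)
timing <- system.time(print(target))   # rendering happens here, not at ggplot()
dev.off()

timing["elapsed"]   # seconds spent drawing all facets of the map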
{
- "objectID": "posts/2022-10-18-cran-rev-dep/index.html#footnotes",
- "href": "posts/2022-10-18-cran-rev-dep/index.html#footnotes",
- "title": "Diving into dependen-“sea”",
+ "objectID": "posts/2022-05-28-ggplot-sf/index.html#footnotes",
+ "href": "posts/2022-05-28-ggplot-sf/index.html#footnotes",
+ "title": "How long do maps on ggplot facets take?",
"section": "Footnotes",
- "text": "Footnotes\n\n\nOn 5th Oct, CRAN sent out a massive email to inform 4747 downstream package maintainers of the potential archive of package isoband on 2022-10-19.↩︎"
+ "text": "Footnotes\n\n\nTo make a proper benchmark of time, ideally each plot (p1 - p21) should be evaluated repetitively to obtain a distribution of the elapsed time. I set up a script with 50 repetitions and let it run overnight, but what I got next morning was “RStudio quit unexpectedly”. I suspect there is something going on with opening and closing the graphic devices too many times…↩︎"
},
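The repetition scheme described in this footnote could be sketched as below; p1 and p21 stand for the first and last of the benchmarked plots mentioned above and are assumptions here, as is the choice to open and close a fresh device on every iteration (the step the footnote suspects of crashing RStudio).
# Hypothetical version of the overnight benchmark: render each plot 50 times
# and keep the elapsed times, giving a distribution rather than a single number.
reps <- 50
plots <- list(p1 = p1, p21 = p21)          # assumed: the plots being benchmarked
results <- lapply(plots, function(p) {
  replicate(reps, {
    png(tempfile(fileext = ".png"))
    t <- system.time(print(p))["elapsed"]
    dev.off()                              # device opened and closed every repeat
    t
  })
})
sapply(results, median)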
{
- "objectID": "posts/hackathon_2023/index.html",
- "href": "posts/hackathon_2023/index.html",
- "title": "Hackathon 2023",
+ "objectID": "posts/pca/index.html",
+ "href": "posts/pca/index.html",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
"section": "",
- "text": "Overview\nThe third NUMBAT hackathon was held Feb 22-24, 2023 in San Remo, Vic. A hackathon is style like RopenSci events where attendees brainstorm potential projects, and join to gether in small groups to tackle a subset of these.\nProjects\nProjects tackled can be found on github.\n\n\n\nParticipants\n\n\n\n\n\nBrainstorming the projects\n\n\n\n\n\nMaking sushi\n\n\n\n\n\nAussie barbecue - love the aprons!\n\n\n\n\n\nPelican feeding"
+ "text": "I recently moved into a share house with three of my friends, and while we generally get along pretty well, I would be lying if I said I never fantasised about burning the place down with them all in it. Today, after I woke up to the dishwasher run with less that half a load, I made this passive aggressive drawing and sent it to the group chat. I have great conflict resolution skills.\n\nThe three people I live with all know me, but none of them know each other, and so as the central housemate, I have often wondered if this clear social dynamic appears in our communication (such as messenger data). This is something that could be easily found through a combination of a principal component analysis (PCA) and violating my housemates privacy. Both of which are enjoyable and straightforward. When PCA was introduced to me in uni, I struggled a bit to understand the plots. So, while I’m violating my housemates privacy, I’m also going to go over the ‘gist’ of PCA and unravel the plots that come with it."
},
{
- "objectID": "posts/boosting/index.html",
- "href": "posts/boosting/index.html",
- "title": "Learning Boosting Through Me Getting Fired from Tutoring",
+ "objectID": "posts/pca/index.html#surrounded-by-incompetence",
+ "href": "posts/pca/index.html#surrounded-by-incompetence",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
"section": "",
- "text": "I’ve had about… 13 jobs at this point in my life. Among them were jobs like tutoring, nannying, swim teaching, ect. so I have developed had a decent level of experience in teaching kids, specifically teaching them maths. While swim teaching doesn’t seem like it employs a lot of maths, I would play a “who can get closest to the number I’m thinking” game to decide who goes first. I then used it to explain game theory and how to optimise their strategy based on what the other kids would pick if they were acting as rational agents. They didn’t fully understand, but it was a fun exercise.\n\n\n\nI was never a very good tutor because I have a tendency to overcomplicate simple problems, and argue with the student’s teacher or parent. A recurring personality trait that is likely apparent after reading enough posts from this blog. The worst case of this was when I was fired from my tutoring job several years back. But what do my failures as a tutor have to do with boosting?.\n\n\nI have always seen boosting as the one of the most intuitive ensemble methods. For anyone who doesn’t know, an ensemble model combines many individual models to create one aggregate model that tends to have greater accuracy and less variance than any of the individual models. Think of it as the machine learning version of everyone voting to make a decision instead of a single expert making a decision. If we relate them to human studying, boosting is like doing every question at least once and then only revising the questions we previously got wrong. This makes boosting more similar to the way I study, and try to teach my (previous) tutoring students. In machine learning, boosting builds the models sequentially where each new model is built on the residuals of our current model, and only takes a small amount of predictive power from each (known as the learning rate). To see how this in action, lets look at the animation below.\n\n\n\nHere, I have made a boosting model using 50 regression trees that each consist of a single split, and have a learning rate (how much information we take from each tree) of 0.1. The colour represents the value for y. In the background we have the current predicted values for that area, and the actual data we are working with in the foreground. The size of the data represent the current error for that observation. It is pretty apparent that the data points become smaller as the background (predicted value) more closely resembles our training data. Each dashed line indicates the most recent regression tree (or in this case stump) that has been added to the model. Since this is a model that progressively learns, both the error and prediction change as we incorporate more and more models. Now that we have a visual on how boosting works, lets talk about tutoring."
+ "text": "I recently moved into a share house with three of my friends, and while we generally get along pretty well, I would be lying if I said I never fantasised about burning the place down with them all in it. Today, after I woke up to the dishwasher run with less that half a load, I made this passive aggressive drawing and sent it to the group chat. I have great conflict resolution skills.\n\nThe three people I live with all know me, but none of them know each other, and so as the central housemate, I have often wondered if this clear social dynamic appears in our communication (such as messenger data). This is something that could be easily found through a combination of a principal component analysis (PCA) and violating my housemates privacy. Both of which are enjoyable and straightforward. When PCA was introduced to me in uni, I struggled a bit to understand the plots. So, while I’m violating my housemates privacy, I’m also going to go over the ‘gist’ of PCA and unravel the plots that come with it."
},
{
- "objectID": "posts/boosting/index.html#my-employment-history",
- "href": "posts/boosting/index.html#my-employment-history",
- "title": "Learning Boosting Through Me Getting Fired from Tutoring",
- "section": "",
- "text": "I’ve had about… 13 jobs at this point in my life. Among them were jobs like tutoring, nannying, swim teaching, ect. so I have developed had a decent level of experience in teaching kids, specifically teaching them maths. While swim teaching doesn’t seem like it employs a lot of maths, I would play a “who can get closest to the number I’m thinking” game to decide who goes first. I then used it to explain game theory and how to optimise their strategy based on what the other kids would pick if they were acting as rational agents. They didn’t fully understand, but it was a fun exercise.\n\n\n\nI was never a very good tutor because I have a tendency to overcomplicate simple problems, and argue with the student’s teacher or parent. A recurring personality trait that is likely apparent after reading enough posts from this blog. The worst case of this was when I was fired from my tutoring job several years back. But what do my failures as a tutor have to do with boosting?.\n\n\nI have always seen boosting as the one of the most intuitive ensemble methods. For anyone who doesn’t know, an ensemble model combines many individual models to create one aggregate model that tends to have greater accuracy and less variance than any of the individual models. Think of it as the machine learning version of everyone voting to make a decision instead of a single expert making a decision. If we relate them to human studying, boosting is like doing every question at least once and then only revising the questions we previously got wrong. This makes boosting more similar to the way I study, and try to teach my (previous) tutoring students. In machine learning, boosting builds the models sequentially where each new model is built on the residuals of our current model, and only takes a small amount of predictive power from each (known as the learning rate). To see how this in action, lets look at the animation below.\n\n\n\nHere, I have made a boosting model using 50 regression trees that each consist of a single split, and have a learning rate (how much information we take from each tree) of 0.1. The colour represents the value for y. In the background we have the current predicted values for that area, and the actual data we are working with in the foreground. The size of the data represent the current error for that observation. It is pretty apparent that the data points become smaller as the background (predicted value) more closely resembles our training data. Each dashed line indicates the most recent regression tree (or in this case stump) that has been added to the model. Since this is a model that progressively learns, both the error and prediction change as we incorporate more and more models. Now that we have a visual on how boosting works, lets talk about tutoring."
+ "objectID": "posts/pca/index.html#what-is-pca",
+ "href": "posts/pca/index.html#what-is-pca",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "section": "What Is PCA?",
+ "text": "What Is PCA?\n\nThe Theory\nI would have just jumped into a nice example of understanding the plots, but for the sake of completeness I will explain how PCA works. The idea of PCA is to summarise the “information” of a dataset into its principal components (PCs), and then interpret those instead. These PCs are built to be linear combinations of our variables in their most “interesting” direction. Where “interesting” means the direction of most variance. Think of a linear regression but instead of projecting our results onto a line that uses x to capture as much information as possible about y, we are using both variables trying to capture as much information as possible in the x and y direction that has the most variance. Explaining this with words is a bit difficult, so I have drawn a visualisation of this below.\n\nFollowing on from this illustration, an easy way to understand principal components is to shift your current understanding of linear regression (I’m assuming you have some current understanding of linear regression). The variable loadings are similar to variable weights in the regression line. We interpret the loadings as “how much that variable contributes to the PC”. Our prediction for a value in linear regression is its projection onto the regression line (with the error shown in the above illustration in red). When working with PCA, our observation’s values are their projection onto the PC line. It is important to note that the red lines in in the PCA drawing is not error, but rather the “remaining” value that will then be used to build the second PC. This is just a quick overview of what these values represent (if you want something more technical look at a textbook or something, this isn’t a maths class). Now, lets take a quick look at the data we are working with.\n\n\nSharehouse Chat Data\nTo put some faces (albeit badly drawn cartoon ones) to names, here is an illustration of my housemates. I have also added a fun fact (checked by them after a large amount of “is this what you think of me” arguing) to help give an idea of their personalities. I’m basically introducing them like a 2000’s MTV dating show, but hopefully this will age better and be less racist/homophobic/sexist.\n\nThe data we are going to be working with is the Facebook messenger records of the sharehouse group chat. When I downloaded it, there were about about 6000 messages, over 3000 of which were sent by me. I was originally interested in analysing all my messenger data but seeing that number stung enough for me to pretend I didn’t download my other chat files. I’d rather live in ignorance than face the fact that I feel the need to update all my friends on everything I do.\nSo, after a bit of cleaning (removing punctuation, removing stop words, breaking observations up into single words, counting the frequency by person, diving each value by total number of words said in the chat by that person) I have a my dataframe. Each variable is someone who lives in the house, each observation is a word, and the values are how many times that word was said relative to the number of words that person has sent in total. So my value for the word “tom” is how many times I have said “tom” as a fraction of all the words I have sent to the chat. I could skip making the values a “relative” frequency, but then our PCA would likely just tell us that I am absolutely incapable of shutting up, rather than what words typify each speaker. 
Below is a glimpse at the data that we will run through the PCA.\n\n\n\n\n\nword\nHarriet\nZac\nEm\nTom\n\n\n\n\ntom\n0.0178\n0.0171\n0.0107\n0.0081\n\n\nhouse\n0.0165\n0.0155\n0.0135\n0.0129\n\n\nzac\n0.0149\n0.0078\n0.0088\n0.0064\n\n\nem\n0.0127\n0.0194\n0.0000\n0.0081\n\n\nyeah\n0.0090\n0.0248\n0.0334\n0.0274\n\n\ntime\n0.0091\n0.0148\n0.0102\n0.0113\n\n\n2\n0.0080\n0.0054\n0.0088\n0.0064\n\n\nshit\n0.0080\n0.0062\n0.0060\n0.0000\n\n\nstuff\n0.0074\n0.0023\n0.0037\n0.0064\n\n\npeople\n0.0067\n0.0078\n0.0042\n0.0081\n\n\n\n\n\nNow that we have some data, lets discuss how we interpret the loadings."
},
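The cleaning steps listed above could look something like the following sketch, assuming the exported chat has been read into a data frame chat with columns sender and message; those column names and the tidytext-based approach are my assumptions, not the post's actual code.
# Sketch of the cleaning pipeline: tokenise, drop stop words, count words per
# person, convert counts to relative frequencies, then widen to a
# word-by-housemate table like the glimpse above.
library(dplyr)
library(tidyr)
library(tidytext)

word_freqs <- chat %>%
  unnest_tokens(word, message) %>%          # one row per word; lowercases and strips punctuation
  anti_join(stop_words, by = "word") %>%    # remove stop words
  count(sender, word) %>%
  group_by(sender) %>%
  mutate(freq = n / sum(n)) %>%             # divide by that person's total word count
  ungroup() %>%
  select(-n) %>%
  pivot_wider(names_from = sender, values_from = freq, values_fill = 0)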
{
- "objectID": "posts/boosting/index.html#part-1-focusing-on-mistakes",
- "href": "posts/boosting/index.html#part-1-focusing-on-mistakes",
- "title": "Learning Boosting Through Me Getting Fired from Tutoring",
- "section": "Part 1: Focusing on Mistakes",
- "text": "Part 1: Focusing on Mistakes\n\nIf you get 100%, You don’t need tutoring.\nThe interaction that got me fired from my tutoring company was with a kid I’ll call, Riley. After being begged to take him on as a student (he was a 5th grader and I teach high school maths) they sent me the information I needed to teach Riley. The first was an email from his teacher that read like this: Hi Mrs Riley, I’m not sure why you are getting your son tutoring considering he has no trouble in class. I have nothing he needs to work on. Maybe the tutor could teach him next semester’s content, but then he would just be bored in class so I wouldn’t recommend it.” I think, great, not only does this kid not need tutoring, but his parents are actively going against his teachers advice. Not a good sign. Next I read a note from the last tutor. “I just bring a computer game or a worksheet for him to do, and then mark it” Double great. This comment was even worse. I was clear this kid had nothing to learn, so it didn’t matter what the last tutor did with him. A tutoring session of watching a kid do things they already knows how to do with no useful feedback can go completely unnoticed. You get the most “bang for your buck” focusing on your worst areas, as they are both the areas requiring the most improvement, and are forgotten the fastest. I incorporate this attitude to every aspect of my life. You can see how in the visual below.\n\n\n\nIf you are just revising things you already know with 100% accuracy, you are not learning.\n\n\nBuilding Models in the Residual Space\nIf we build an ensemble model that is 50 models, each identical and with perfect predictions, we get the same result as if we made one. This is just wasting computational power much in the same way Riley’s family was wasting money on tutoring. In boosting, since each model is built on the residuals of previous models, it is trying to make sure that it does not repeatedly learn things it already knows. The model focuses on the most common, frequent, and damning errors, and works its way back from that. In the first animation, I let the size represent the errors, but each model is not built using the response variable, it is built using the residuals. Here, using the exact same data and model above, I have instead animated each individual tree as it tries to predict the residuals.\n\n\n\nWe can see that when we start our boosted model, the residuals are essentially our y value (since the initial prediction for the whole area is 0), and as the main model becomes more accurate, the residuals become 0, and new trees don’t have any information to contribute to the model. If the model continued much further, it would just randomly build trees on the irreducible error.\nBy focusing on the residual space, the model ensures that we aren’t wasting computations by relearning something we already know. In a similar way, the best way to learn as a human is not to revise the areas we get 100% in, but rather the areas we are failing in as they offer the most room for improvement."
+ "objectID": "posts/pca/index.html#the-loadings",
+ "href": "posts/pca/index.html#the-loadings",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "section": "The Loadings",
+ "text": "The Loadings\n\nThe Theory\nThe loadings have two things about their interpretation that make them a bit tricky to understand: 1. We are plotting what would be on the axis of the plot in our typical scatter plot (the variables) as observations 2. We are using these “observations” to understand the axis (our PCs). I have drawn this relationship below for additional clarity.\n\nNote: these aren’t the actual loading values of PC1 and PC2 from the example below, this is just an illustration\nTo make matters even more complicated, we usually plot our PCA on a biplot with both loadings and observations. We will make and interpret this plot at the end, but since this is an “introduction to understanding PCA plots” we are going to start with only plotting the loadings, and work our way to the biplot.\nTo interpret our loadings we need to keep three things in mind: 1. The principal components summarise information in descending order of importance. This means that each PC will represent a more overt trend in the data than the PC that follow it. 2. The direction of the PCs is the most important take away. If all your loadings are in the same direction then this PC is analysing the ways in which all your variables are the same. If they move in opposite directions, the PC is identifying a juxtaposition. The actual direction of the loading (positive or negative) doesn’t matter too much outside of the loading’s direction relative to the others. This might seem a bit confusing, it will make more sense once we look at the first loading in the example below. 3. The magnitude of the loading is the least important part. If you start getting so detailed that you are thinking deeply about the magnitude, you are likely overcomplicating the problem for yourself. Just pay attention to the loadings that are significantly different from 0 (I marked these using a red line in the example).You can find your significance line as \\frac1{\\sqrt{p}} where p is the number of variables in your PCA (in the example it’s 4). As with anything, this will be easier to understand with an example, so lets just look at what the sharehouse PCA produced.\n\n\nSharehouse Chat Loadings\nTo start off with, we need to use the loadings to interpret the PCs. The first two PC’s capture most of the variance, and so typically we focus on those two, however since we only have 4 variables (and so 4 possible PCs) I might as well do them all.\n\nKeeping in mind what we covered above, we can analyse these plots. As a side note, the order of names (the x-axis of these plots) are arbitrary and organised only to make the words readable, so we only need to interpret the y-axis (the PC loadings). To begin lets start with PC1, the most important PC. Since all the loadings are negative, any persons use of a word will give that word a negative value on the first PC. To put it simply, words we say a lot as a combined group will have a large negative score, and words that we never say will sit around 0. There wont be any positive values on PC1 because each word’s value is the Housemate'sPCLoading\\times{Housemate'sWordFrequency}, summed up for all 4 of us. So since none of the words will have a negative frequency that could cancel out the negative loadings word’s wont have positive value on PC1. Here are the 4 loading interpreted in their positive direction:\nPC1: Words None of us say - The overarching ways in which the four of us are similar thanks to generation and circumstances (of living together). 
This PC will likely contain words people who live together and people our age use. PC2: Words Tom never says - Out of all of us, the most distinct speaker of the group is Tom. PC3: Words that Em uses - Em is the next most distinct. PC4: Words Differentiate Zac and I - Zac and I were on the same side of all the other loadings, and so once all the other sources of variance have been dealt with, this is all that is left. It makes sense, as we are the oldest and closest friends, so our speech is the most similar.\nInterestingly, the loadings captured the underlying dynamics of the group pretty well. Since the PCs are organised such that they explain decreasing variance, this tells us that the overarching patterns of speech between the 4 of us (PC1) is more salient than the difference between Tom’s and the rest of us (PC2) and so on. I have drawn the social circles identified by the PC loadings below, both as an illustration of the analysis, and to personally attack Tom. Using this understanding of our new variables (the PCs) we can interpret our observations, just as we would normal variables.\n\nAnother note I want to make is that I could have set up this data frame so that the words were the variables instead of the sender (I actually did this the first time without thinking). The main problem with this comes in the analysis. If the variables are words and the largest loadings come from “yeah”, “tom” and “house”, it is hard to understand how these words are similar, and how they are different. That analysis is much easier to do on people, because I have prior understanding of the context of those variables."
},
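As a companion to the loading plots described above, a sketch of how the loadings and the reference line could be obtained with prcomp, assuming word_freqs is the word-by-housemate table from the earlier sketch; whether the original analysis centred or scaled the frequencies is not stated, so those arguments are assumptions.
# Sketch: PCA on the four housemate columns; the loadings live in $rotation.
pc <- prcomp(word_freqs[, c("Harriet", "Zac", "Em", "Tom")],
             center = TRUE, scale. = TRUE)   # scaling is an assumption

loadings  <- pc$rotation                 # housemates (rows) x PC1-PC4 (columns)
threshold <- 1 / sqrt(nrow(loadings))    # the red line: 1/sqrt(p), here 1/sqrt(4) = 0.5

round(loadings, 2)
abs(loadings) > threshold                # loadings worth interpreting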
{
- "objectID": "posts/boosting/index.html#part-2-the-learning-rate-the-number-of-models-and-the-model-complexity",
- "href": "posts/boosting/index.html#part-2-the-learning-rate-the-number-of-models-and-the-model-complexity",
- "title": "Learning Boosting Through Me Getting Fired from Tutoring",
- "section": "Part 2: The Learning Rate, The Number of Models, and The Model Complexity",
- "text": "Part 2: The Learning Rate, The Number of Models, and The Model Complexity\n\nRash Decisions in Tutoring Is a Dangerously Simple Method\nWhen I arrive at Riley’s house, I explain I don’t have any computer games or worksheets because I disagree with them morally, however I could cover future school work and invent some fun questions. Riley’s mum was not a big fan of my moral plight to take down “big tutoring”. After a brief discussion about how “we are all a little disorganised” which everyone knows is mum code for “you are disorganised”, she sent me home. Later I received a call from my boss about being “ill-prepared” because I should have just brought computer games and worksheets like the last tutor recommended. I explained my side, and by boss was sympathetic, but I never got another tutoring job from them again. Unfortunately, due to Riley’s mum being unsupportive of trying new teaching methods, the best speed at which Riley should cover new content wont be found. He might have learnt better with longer sessions, or with another student, or doing literally anything other than playing computer games. Much in the same way that we can tailor the environment and complexity of a tutoring session, boosting can improve its predictions by changing the learning rate, number of models and the model complexity.\n\n\nTinkering the Complexity of the Boosting Model\nWhen using boosting, we need to be aware of how the learning rate (or shrinkage), the number of models and the model complexity impact our final prediction. The learning rate decides how much “predictive power” we take from each trees. Smaller learning rates need more models to get a working prediction, larger learning rates run the risk of giving too much power to outlier models, and missing minor complexities. The number of models (trees in our example) is just decided in parallel with the learning rate, and is essentially how much computational time we are willing to dedicate to our model. The depth of the tree is similar, in the sense that with enough trees, a stump tree can capture any relationship, however if we don’t have the capacity for enough models, we can increase the complexity of each individual model to add more nuance to the final prediction."
+ "objectID": "posts/pca/index.html#understanding-observations",
+ "href": "posts/pca/index.html#understanding-observations",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "section": "Understanding Observations",
+ "text": "Understanding Observations\n\nThe Theory\nUnderstanding the observations is very straight forward once you have the PC interpretations. Usually when analysing our data, the process looks something like this:\n Variable Meaning -> Understand Observations \nFor example, a low time in a 100m sprint can be interpreted as fast. Obviously, PC1 does not have an inherent meaning to us in the same way that the time for a 100m sprint does, but that is what the loading interpretations was for. The process for understanding the data plots in PCA is:\n Construct PCs -> Use loadings to find PC meaning -> Understand Observations \nSo from this we can see that the interpretation of data in PCA vs regular analysis is almost the same, there is just an extra step (which we have already done in our example) that can complicate it a bit. Now that we understand how to interpret the observations in the PCA, let’s apply this to the sharehouse chat data to finish off the analysis.\n\n\nSharehouse Chat Observations\n\nHow do we interpret these plots? Well we need to use our interpretations of the loadings to understand what our axis represent. Since we established that PC1 represents words we all use, the distance below the line indicates how frequently the word is used between us all. For example, “yeah” and “house” are the most used words across the chat. This makes sense as we are pretty informal and all live together. We can do the same thing for PC2, which identified the ways Tom speaks differently. He uses “nbn” a lot because he is the one who set up the internet. “Tom” is a common word for Zac and I, not only because we love to bully our housemate Tom, but because we also have a few mutual friends (and some not friends) called Tom that we talk about in the chat.\nI sent all these plots to the group (I like to keep them informed) and Em said “I’m apparently the only one who laughs in this chat”. Now this brings up an interesting point in how this analysis was run, and it shows how PCA can bring out some patterns that may not be immediately recognisable in the data.\nThe data cleaning will correct for things like capitalisation (so here Here and HERE are all the same word) but if the words differ by letters (here and herez) thanks to typos or spelling mistakes, they are registered as different words. This creates a problem for registering words that I typically use, since: 1) I’m an absolutely abysmal speller and rarely spell a word the same way twice; and 2) I type laugher according to how funny I think something is (haha vs hahahahahahaha) This means, someone like Zac who almost always laughs in the same way with “lmfao”, or Em with “hahaha” and “hahahaha’, have all their chat laughter collected into one observation. Looking through the records I laugh to a similar degree, but almost all of them are recorded as unique words in the frequency count, and thus don’t make it to the analysis. Tom just never laughs at anything."
},
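Continuing the same sketch, the word positions discussed here are the PC scores in pc$x; sorting by a score column surfaces the words that drive each PC (pc and word_freqs are the assumed objects from the earlier sketches).
# Sketch: observation (word) scores on each principal component.
scores <- as.data.frame(pc$x)
scores$word <- word_freqs$word

# Most negative PC1 scores ~ words we all use a lot;
# extreme PC2 scores ~ words that set Tom apart, per the loading interpretation.
head(scores[order(scores$PC1), c("word", "PC1")])
head(scores[order(-abs(scores$PC2)), c("word", "PC2")])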
{
- "objectID": "posts/boosting/index.html#part-3-need-to-know-when-to-quit",
- "href": "posts/boosting/index.html#part-3-need-to-know-when-to-quit",
- "title": "Learning Boosting Through Me Getting Fired from Tutoring",
- "section": "Part 3: Need to Know When to Quit",
- "text": "Part 3: Need to Know When to Quit\n\nOverfitting in Learning\nI know someone has spent too long studying when I see forum posts asking if some obscure topic is going to be on the exam. Once you have run out of things to focus on that are important, you start to focus on the things that are less and less important, until you are sitting awake at night crying about the sheer vastness of knowledge that you could never hope to learn. Knowing when to quit is an important part of life and machine learning. Most people tell other to “try try and try again” my motto is “if you aren’t feeling it, quit”. After several years of tutoring, I was no longer feeling it, and it was time to quit. It turns out repeatedly being told “the continuity of functions doesn’t matter” and “dividing a number by 0 is 0” my soul had been crushed and I wasn’t doing my job properly any more. I had too much baggage and it was time to quit. Just like with tutoring, boosting needs to know when to quit too.\n\n\nBoosting can Overfit\nUnlike in bootstrapping, boosting has the potential to overfit. Since the later predictions are the cumulative prediction of all the models that came before, and the new models are only concerned with what those models got wrong, the overall benefit of each model is less than the model before it. This means that eventually, the tangible benefit of building a new tree becomes zero. Because of this, we always need to be aware of our ensemble complexity and manually set a stopping criteria."
+ "objectID": "posts/pca/index.html#biplot-putting-it-all-together",
+ "href": "posts/pca/index.html#biplot-putting-it-all-together",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "section": "Biplot: Putting It All Together",
+ "text": "Biplot: Putting It All Together\nNow these plots only show one principal component each, and also don’t have the loadings on them. I started by separating the elements of the plot, but making several plots when the information could be conveyed with a single plot is tiresome. Now that we understand each of the components by themselves, lets make a biplot to show how this information is usually conveyed all together.\n\nTypically we use the first two principal components when we build the biplot because they contain the most variance, and thus the most information. This final plot is usually how a PCA is presented to us, with the observations and loadings plotted together and each axis representing a principal component. While the plot looks a little different now, the interpretations are still the same, and as a matter of fact understanding the observation is a little easier than before. Since we have the loadings on the plot too, we no longer need to hold the interpretation of the PCs in our mind to understand the observations. On the x axis, the further to the left a word is, the more we all use it, on the y-axis, the further down an observation is, the more Tom specifically uses it. Now we can make analysis of our observations using this combined information, rather than separating it. For example, looking at the biplot we can see that while “tom” is used a LOT in the chat overall, that is largely due to Zac and I, rather than Tom saying his own name.\nThe biplot allows us to summarise most of the information covered in this post in a single figure, and knowing how to interpret it makes your life much easier. That being said, if you have a lot of loadings you might still need to separate the plots as a biplot can get messy and crowded when we have too many."
},
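For completeness, a biplot like the one described here can be drawn in one call from the same assumed pc object; base R's biplot() is a stand-in for however the original figure was produced.
# Sketch: words as points, the four housemates' loadings as arrows,
# PC1 on the x-axis and PC2 on the y-axis.
biplot(pc, choices = 1:2, xlabs = word_freqs$word, cex = 0.6)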
{
- "objectID": "posts/boosting/index.html#conclusion",
- "href": "posts/boosting/index.html#conclusion",
- "title": "Learning Boosting Through Me Getting Fired from Tutoring",
+ "objectID": "posts/pca/index.html#conclusion",
+ "href": "posts/pca/index.html#conclusion",
+ "title": "Using PCA to Bully My Housemates (Specifically Tom)",
"section": "Conclusion",
- "text": "Conclusion\nBoosting employs three techniques that make it similar to effective human learning. First it focuses on mistakes, secondly it is important to tailor the complexity of any one session, and finally it need to be manually stopped or otherwise your model will stare into the abyss of the unknowable in existential dread.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
+ "text": "Conclusion\nWhile PCA plots can seem confusing at first, once you break them down into their components, they are pretty straight forward to understand. Also Zac said I need to include his twitter handle which is @zaccheus_e so I can direct people to an illiterate and poorly structured rebuttal."
},
{
- "objectID": "posts/LIME/index.html",
- "href": "posts/LIME/index.html",
- "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "objectID": "contact.html",
+ "href": "contact.html",
+ "title": "Contact",
"section": "",
- "text": "When I found out baby teeth fall out, I realised the futility brushing them. The teeth are temporary, but those extra 5 minutes of playing Pokemon are forever. So I quit brushing my teeth. This wouldn’t have been too big a problem for a normal kid, but I also refused to drink water. A strangely pervasive problem in our family that started young (my brother was weaned off breast milk using chocolate Breaka) and lived into adulthood. I exclusively drank Golden Circle Raspberry Cordial, called it pink juice, carried it in my drink bottle, and I would sooner collapse from dehydration before I drank anything else. As you can imagine my teeth decayed at an alarming rate. A visit to the dentist in second grade told my parents something they were well aware of. If you let a child make their own terrible health decisions, they will cost you $10k in dental bills because apparently to a child, pain is an illusion. A lesson that should have been no surprise to them since that same year I made Mum let me slip my broken arm out of its cast to do my ballet examination, and I was still annoyed I only got a Merit. I don’t know if all kids are immune to pain and the consequences of their actions, but I certainly was. So for years I had 4 metal crowns, 13 fillings, and a sudden jolt of pain every time I accidentally got aluminium in my mouth. As an adult I leant my lesson and brush my teeth and floss twice a day. I mean I still don’t drink water, I just upgraded from Pink Juice to Pepsi Max. But I still consider a 50% improvement an inspiring story of growth.\nWhat is the point of this story? Is it related to today’s topic or has this blog become a digital diary where I decompress years of a being a child psychopath with irresponsible parents? Both. Although if my parents has a say in this blog they would probably argue they weren’t irresponsible, but rather thought the best way for us to learn was to experience the consequences of our decisions. The problem in my decision making as a child was I had too much of a focus on the long term. While it was true that the teeth were not permanent and would fall out, I still cringe at the idea of biting into metal packaging. Most methods of understanding machine learning models focus on the model as a whole, but in this post we are going to look at the local interpretation. LIME (Localised Interpretable Models) is a model interpretation method that can be applied to any machine learning algorithm, even if its a “black box” method by breaking it into smaller local models that are easy to interpret. To understand the value in this, we need to first look at the flexibility and interpretability trade off."
+ "text": "@numbats_rise_up\n Enter Education Building, we are located primarily on level 3, and to the east\n Monash University, Clayton Campus, Wellington Rd, Melbourne, 3800\n\n\nView Larger Map"
},
{
- "objectID": "posts/LIME/index.html#focus-too-much-on-the-big-picture-get-10k-in-dental-bills",
- "href": "posts/LIME/index.html#focus-too-much-on-the-big-picture-get-10k-in-dental-bills",
- "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
- "section": "",
- "text": "When I found out baby teeth fall out, I realised the futility brushing them. The teeth are temporary, but those extra 5 minutes of playing Pokemon are forever. So I quit brushing my teeth. This wouldn’t have been too big a problem for a normal kid, but I also refused to drink water. A strangely pervasive problem in our family that started young (my brother was weaned off breast milk using chocolate Breaka) and lived into adulthood. I exclusively drank Golden Circle Raspberry Cordial, called it pink juice, carried it in my drink bottle, and I would sooner collapse from dehydration before I drank anything else. As you can imagine my teeth decayed at an alarming rate. A visit to the dentist in second grade told my parents something they were well aware of. If you let a child make their own terrible health decisions, they will cost you $10k in dental bills because apparently to a child, pain is an illusion. A lesson that should have been no surprise to them since that same year I made Mum let me slip my broken arm out of its cast to do my ballet examination, and I was still annoyed I only got a Merit. I don’t know if all kids are immune to pain and the consequences of their actions, but I certainly was. So for years I had 4 metal crowns, 13 fillings, and a sudden jolt of pain every time I accidentally got aluminium in my mouth. As an adult I leant my lesson and brush my teeth and floss twice a day. I mean I still don’t drink water, I just upgraded from Pink Juice to Pepsi Max. But I still consider a 50% improvement an inspiring story of growth.\nWhat is the point of this story? Is it related to today’s topic or has this blog become a digital diary where I decompress years of a being a child psychopath with irresponsible parents? Both. Although if my parents has a say in this blog they would probably argue they weren’t irresponsible, but rather thought the best way for us to learn was to experience the consequences of our decisions. The problem in my decision making as a child was I had too much of a focus on the long term. While it was true that the teeth were not permanent and would fall out, I still cringe at the idea of biting into metal packaging. Most methods of understanding machine learning models focus on the model as a whole, but in this post we are going to look at the local interpretation. LIME (Localised Interpretable Models) is a model interpretation method that can be applied to any machine learning algorithm, even if its a “black box” method by breaking it into smaller local models that are easy to interpret. To understand the value in this, we need to first look at the flexibility and interpretability trade off."
+ "objectID": "courses.html#master-of-business-analytics",
+ "href": "courses.html#master-of-business-analytics",
+ "title": "Courses",
+ "section": "Master of Business Analytics",
+ "text": "Master of Business Analytics\nArm yourself with computational statistical tools in the face of uncertainty.\nSome of the units in the MBAt are:\n\nETC5510 Introduction to data analysis\nETC5512 Wild-caught data\nETC5521 Diving deeply into data exploration\nETC5523 Communicating with data\nETC5500 Applied forecasting\nETC5250 Introduction to machine learning\nETC5450 Advanced R programming\n\nA full list of topics for the program can be seen here"
},
{
- "objectID": "posts/LIME/index.html#the-flexibility-and-interpretability-trade-off",
- "href": "posts/LIME/index.html#the-flexibility-and-interpretability-trade-off",
- "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
- "section": "The Flexibility and Interpretability Trade Off",
- "text": "The Flexibility and Interpretability Trade Off\nI have mentioned (at length) the bias and variance trade off that comes into play when considering the flexibility of a model. What I have not mentioned, is the interpretation trade off that happens at the same time. When we “localise” our model by increasing its flexibility, allowing it to better respond to changes in variables, we also “localise” the possible interpretation. This in turn, means that a single interpretation for the entire span of the possible inputs is no longer useful. At the extreme end of this trade off, we have models in which the intermediate steps are almost impossible to understand, called “black box” models. Early statistics courses introduce flexibility with quadratic models, and deal with the trade off by splitting the area of interpretation. Sadly this is not an idea that easily lends itself easily to more complicated models, a problem I have illustrated below.\n \nAs we start to get into more complicated models our interpretation methods slightly abandon this idea of localising our understanding and instead opt for completely new techniques, like permutation variable importance which I discussed in a previous post. Instead of inventing a new way to understand our models LIME tries to make the interpretation more “localised” in the same way that flexibility “localised” the model itself."
+ "objectID": "posts/FlexiblevsInflexible/index.html",
+ "href": "posts/FlexiblevsInflexible/index.html",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "",
+ "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, at risk to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
},
{
- "objectID": "posts/LIME/index.html#how-does-it-work",
- "href": "posts/LIME/index.html#how-does-it-work",
- "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
- "section": "How does it work?",
- "text": "How does it work?\nThe main idea of LIME is the same main idea of calculus, which is if we zoom in a bunch we can approximate crazy non-linear functions with straight lines. These approximations are pretty good around that point, but get worse the further we move away. The way it works is actually quite simple and can be broken down into a handful a simple steps. 1. Make a localised dataset based on a single observation 2. Build a model on this localised dataset 3. Interpret that model. Some of the technicalities of the process change depending on the type of data we have (tabular, images or text) and I will go through each of them, but in essence, the idea is the same. I’m going to walk through trying to predict cavities based on the three types of data to illustrate the process of LIME, but keep in mind, I’m assuming we already have some trained model that is making prediction, and a set of test observations. Is crushing pain, blackening teeth, or an exclusive diet of raspberry cordial a predictor of cavities? Lets find out.\n\nTabular Data\nThe first method we are going to look at is tabular data. Lets say instead of doing normal dentistry work my dentist wants to predict if I have cavities based on how often I say I brush my teeth, and how much sugar I eat a day. This is a hypothetical world and my hypothetical doctor is omnipotent apparently. He wants to classify his patients into 3 levels based on the financial commitment they are probably about to make to his family practice. He puts my teeth brushing and sugar intake into the model, and it looks like his family should start packing their swimmers, because they are about to go to Hawaii. But how did the model come up with the prediction? In enters, LIME.\nIllustrated below (and explained in this paragraph) is the process LIME will go through to understand this classification. First we select a single point to run our model on, in this case, me or an observation very close to me. Then LIME will generate some “pretend” data around it according to independent Gaussian distributions. As a side note, this means it ignores correlation, and can generate some points that are unlikely to occur in the real data. Then LIME will run our real data point point and all its fake friends through the black box model and find their hypothetical predictions. Similar to LOESS models, the observations are then reweighted based upon their distance to to the initial(only real) data point. Remember, we aren’t trying to understand the model overall, we are only interested in the area surrounding our point. Now, on our fake, weighted data, we train an easy to interpret model. Something like a tree model or linear regression. It doesn’t have to be even slightly similar to the black box model we are analysing, all that matters is that it is a model that is simple, easy to understand and easy to explain.\n \n\n\nImages\nSo my dentist is rubbing his hands together when my mum brings me in for a check-up. Once again ignoring normal dental procedures (I’m starting to wonder if this man is a dentist or some back alley scam artist my parents dug up to teach me a lesson) the dentist decides to take a normal photo of my teeth and predict the probability I have a cavity. His picture based model also suggests cavities, but once again, how did it make that decision? LIME is back to his rescue.\nOnce again we select some observation from our dataset, in this case, a photo of my sad decaying teeth. 
Next, following the tabular method, we would want to create a fake dataset of similar observations, but this is where we run into our first problem. Distance is easy to see in tabular data, its our normal run of the mill Euclidean distance. But how do we define distance for pictures? What metric can we use to say how similar two pictures are. This isn’t a question LIME answers, or even tries to answer but the little algorithm that could does it’s best to work through it. On pictures, rather than making our fake observations a sample that is “close” to the observation in distance, it varies the “superpixels” of the image. Superpixels are just a group of pixels that are next to each other and look similar so they are grouped together. for example, if you had a picture of my face; my skin, hair, lips, etc. would each be their own superpixel. To make our new dataset, LIME turns random super pixels off to create our local dataset. i.e. the pixels in that group cease to exist, are changed to 0, become a black hole of lost information in the land of data. Now we have a bunch of pictures that we run through the black box model to get some cavity prediction. Once again a simple model (like a linear regression) is built using the superpixels as inputs and the probability of a cavity as an output. The image is coloured by LIME based on having a positive impact on the classification or a negative impact.\n \n\n\nText\nFinally after my dentist(?) finishes his dental(?) work, he decides to predict the chance of an expensive revisit based on my conversation with my Mum on our way out. This is a simple classification problem again and the model predicts I will be back with an expensive cavity. Finally, the dentist(??) implements LIME one more time.\nThe method for text is almost identical to the images, only instead of superpixels, it turns words off and on."
+ "objectID": "posts/FlexiblevsInflexible/index.html#a-conspiracy-theory-is-like-a-bad-model",
+ "href": "posts/FlexiblevsInflexible/index.html#a-conspiracy-theory-is-like-a-bad-model",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "",
+ "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, at risk to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
},
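The local-versus-global trade-off described above can be sketched on made-up data: a straight line misses the local structure, while an extremely wiggly smoother chases it (and the noise along with it). The simulated data, the loess choice and the span value are all illustrative assumptions, not the post's own example.
# Sketch: inflexible vs flexible fits to the same noisy sample.
set.seed(1)
x <- runif(100, 0, 10)
y <- sin(x) + rnorm(100, sd = 0.4)

inflexible <- lm(y ~ x)                  # captures only the global trend
flexible   <- loess(y ~ x, span = 0.1)   # very local; prone to overfitting

ord <- order(x)
plot(x, y, pch = 16, col = "grey60")
lines(x[ord], fitted(inflexible)[ord], lwd = 2)          # straight line
lines(x[ord], fitted(flexible)[ord], lwd = 2, lty = 2)   # wiggly fit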
{
- "objectID": "posts/LIME/index.html#limitations-of-the-method",
- "href": "posts/LIME/index.html#limitations-of-the-method",
- "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
- "section": "Limitations of The Method",
- "text": "Limitations of The Method\nThe obvious problem with LIME is the same thing that made it a good idea, and the same reason some people think the earth is flat. If we zoom in too much, we lose sight of the big picture. Since our understanding is limited to single real observation from our dataset, and running it on every observation would be computationally painful, it is at our discretion which observations, and how many observations we run LIME on to understand what is under the hood of a black box model. While I only went through a general understanding of how the process works, there are other posts out there that discuss practical implementation of the model and some of the more technical aspects of how it works which are certainly worth a read."
+ "objectID": "posts/FlexiblevsInflexible/index.html#outrageous-claims-need-outrageous-evidence",
+ "href": "posts/FlexiblevsInflexible/index.html#outrageous-claims-need-outrageous-evidence",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "1: Outrageous Claims Need Outrageous Evidence",
+ "text": "1: Outrageous Claims Need Outrageous Evidence\nMy mother is a “bit eccentric” to put it mildly. In the last few months, to only name a few things, she has bought a fire truck to start mud-crabbing (pictured below), bought some goats because the garden is a pain to manage, and turned the pool into a “fish Club Med” where she collects wildlife from the local creek and feeds them McDonalds for breakfast. From expulsions to arrest warrants, to the man she drank goon with at the beach who now lives in our house, the stories are endless. Despite this, never in my life had I ever been called a liar for telling them (the first time was at university orientation). People at my school had grown used to it, they had met my family and heard years worth of stories so I had a wealth of evidence to normalise my claims. Strangers didn’t have that, and so they didn’t believe my outrageous (completely true) tales. Similarly in statistics, if we want a complicated model we will need a large sample size to back it up.\n \n\nWhy Flexible Models Need a Bigger Sample\nIn general, the larger your sample size, the more likely it is you have captured the “true relationship”. If you are increasing the number of parameters to estimate (not literally for non-parametric models but the idea carries on) without increasing the sample size, we are in effect decreasing the “sample size” for each of the estimated values, and thus decreasing the reliability of our model. Placing more weight on all the observations in calculating our estimates, means we are increasing the influence of outliers and unrepresentative samples. We can either have observations contributing to a large area but averaged over many observations, or over a small area where our estimates are averages over fewer observations. For example, if we have 10 observations and predict using the average, each observation contributes to 1/10th of the prediction, if we use 1-Nearest Neighbour, each prediction is only backed up by a single observation (illustrated below). Highly flexible models can be, and sometimes are, the appropriate choice to model a relationship, we just need a large sample to justify it. Outrageous claims need outrageous evidence."
},
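The 10-observation example above can be made concrete with a tiny hand-rolled comparison of the two extremes: predicting with the overall average (every observation contributes) versus with the single nearest neighbour (one observation carries the whole prediction). The data are invented for illustration.
# Sketch: mean prediction vs 1-nearest-neighbour prediction.
set.seed(2)
x <- sort(runif(10, 0, 10))
y <- 2 * x + rnorm(10, sd = 3)

predict_mean <- function(x_new) rep(mean(y), length(x_new))   # every prediction backed by all 10 points
predict_1nn  <- function(x_new) {
  sapply(x_new, function(x0) y[which.min(abs(x - x0))])       # each prediction backed by 1 point
}

x_new <- c(1, 5, 9)
predict_mean(x_new)   # the same global average everywhere
predict_1nn(x_new)    # follows whichever single point happens to be closest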
{
- "objectID": "posts/LIME/index.html#in-conclusion",
- "href": "posts/LIME/index.html#in-conclusion",
- "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
- "section": "In Conclusion",
- "text": "In Conclusion\nIf something is called “pink juice” it will give you cavities, and if your dentist uses machine learning algorithms instead of normal dental practices, he might not be an actual dentist.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.— title: “Untitled” editor: visual —"
+ "objectID": "posts/FlexiblevsInflexible/index.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
+ "href": "posts/FlexiblevsInflexible/index.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups",
+ "text": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups\nThe introduction of the internet was the age of new information. Conspiracy theories were on their way out, now anyone can use their phone and find the facts in seconds. Or can they? What I unfortunately discovered when mum got involved with conspiracy theories, is that for every website with legitimate information, there are 50 that don’t. The sheer vastness of the internet means that whenever we expand our search for hidden truth, we are just as likely to discover falsities. This is a useful illustration in dimensionality.\n\nFlexible Models Are Hurt More By Additional Parameters\nDimensionality interacts with the flexible vs inflexible models in two ways. The first is that in some occasions adding dimensions can literally be seen as making the model more flexible. Think of adding a squared variable to a linear regression to make it quadratic, we have made the model more flexible by adding a dimension. The second way it interacts with our models, is by increasing the distance between observations, and thus the amount of input space each observations needs to be used for. To get technical, each additional parameter makes the area each observation is responsible for increase exponentially. Just like how increasing flexibility increases the “weight” of observations by localising their impact on the model, dimensionality makes the total “area” bigger, and so it does a similar thing. Sometimes the relationship between our variables needs to be modeled with a highly flexible model, and so we need to keep this interaction between flexibility and dimensionality in mind so the variance doesn’t get out of control."
},
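The claim above that each added dimension blows up the space every observation has to cover can be seen in a few lines. The sketch below (Python with numpy; the sample size and list of dimensions are arbitrary choices, not from the post) tracks the average distance to the nearest neighbour as dimensions are added while the sample size stays fixed.

```python
# Hypothetical illustration of the curse of dimensionality: with a fixed
# number of points in the unit hypercube, the average nearest-neighbour
# distance grows quickly as the dimension increases, so each observation
# is responsible for an ever larger chunk of the input space.
import numpy as np

rng = np.random.default_rng(1)
n = 100
for d in (1, 2, 5, 10, 20):
    x = rng.uniform(0, 1, size=(n, d))                       # n points in d dimensions
    dists = np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)
    np.fill_diagonal(dists, np.inf)                          # ignore each point's distance to itself
    print(f"d = {d:2d}: mean nearest-neighbour distance = {dists.min(axis=1).mean():.3f}")
```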
{
- "objectID": "posts/index.html",
- "href": "posts/index.html",
- "title": "Posts",
- "section": "",
- "text": "Hackathon 2024\n\n\n\n\n\n\n\n\n\n\n\n28 May 2024\n\n\nNumbats Rise-up\n\n\n\n\n\n\n\n\n\n\n\n\nSecret Santa 2023\n\n\n\n\n\n\n\n\n\n\n\n22 November 2023\n\n\nNumbats Gathering\n\n\n\n\n\n\n\n\n\n\n\n\nReducing duplication in teaching materials\n\n\n\n\n\n\n\n\n\n\n\n19 July 2023\n\n\n\n\n\n\n\n\n\n\n\n\nHackathon 2023\n\n\n\n\n\n\n\n\n\n\n\n24 February 2023\n\n\nNumbats Rise-up\n\n\n\n\n\n\n\n\n\n\n\n\nDiving into dependen-“sea”\n\n\nHow CRAN packages are interconnected\n\n\n\n\n\n\n\n\n18 October 2022\n\n\nH. Sherry Zhang\n\n\n\n\n\n\n\n\n\n\n\n\nHow long do maps on ggplot facets take?\n\n\n\n\n\n\n\n\n\n\n\n27 May 2022\n\n\nH. Sherry Zhang\n\n\n\n\n\n\n\n\n\n\n\n\nHexmaps with sugarbag make it easier to see the electoral map\n\n\n\n\n\n\n\n\n\n\n\n21 May 2022\n\n\nDi Cook\n\n\n\n\n\n\n\n\n\n\n\n\nDo you need some analytics help? Maybe our internship program is for you!\n\n\n\n\n\n\n\n\n\n\n\n18 February 2022\n\n\nDan Simpson\n\n\n\n\n\n\n\n\n\n\n\n\nTrying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation\n\n\n\n\n\n\n\n\n\n\n\n18 December 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nA Future Public Disturbance Explains LDA\n\n\n\n\n\n\n\n\n\n\n\n2 August 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nUsing PCA to Bully My Housemates (Specifically Tom)\n\n\n\n\n\n\n\n\n\n\n\n19 April 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nCan our Masters students help you?\n\n\n\n\n\n\n\n\n\n\n\n5 March 2021\n\n\nRob J Hyndman\n\n\n\n\n\n\n\n\n\n\n\n\nLearning Boosting Through Me Getting Fired from Tutoring\n\n\n\n\n\n\n\n\n\n\n\n3 January 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nBaby Teeth are Temporary, Model Interpretability is Forever\n\n\n\n\n\n\n\n\n\n\n\n13 October 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nHow a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)\n\n\n\n\n\n\n\n\n\n\n\n6 October 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nCriminal Statistics in Baby Murder Court Cases\n\n\n\n\n\n\n\n\n\n\n\n14 September 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\n4 Things We Can Learn About Conspiracy Theories and Model Flexibility\n\n\n\n\n\n\n\n\n\n\n\n11 August 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nMario Party: Destroyer of Friendships and Explainer of Convolutional Neural Networks\n\n\n\n\n\n\n\n\n\n\n\n11 August 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nMy Idiot Brain Ruined My School’s NAPLAN average, But Bootstrapping Could Have Saved it\n\n\n\n\n\n\n\n\n\n\n\n4 August 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nUsing the Bachelor to Understand Permutation Variable Importance\n\n\n\n\n\n\n\n\n\n\n\n29 July 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nA Deep Dive into How Flexibility Affects The Bias and Variance Trade Off\n\n\n\n\n\n\n\n\n\n\n\n20 July 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nNUMBAT Hackathon 2020\n\n\nCoding together\n\n\n\n\n\n\n\n\n19 February 2020\n\n\nDi Cook\n\n\n\n\n\n\nNo matching items"
+ "objectID": "posts/FlexiblevsInflexible/index.html#capitalism---the-gateway-conspiracy-to-lizard-people",
+ "href": "posts/FlexiblevsInflexible/index.html#capitalism---the-gateway-conspiracy-to-lizard-people",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "3: Capitalism - The Gateway Conspiracy to Lizard People",
+ "text": "3: Capitalism - The Gateway Conspiracy to Lizard People\nNobody suddenly wakes up in the morning, looks in the mirror and says to themselves “Yes, today is the day. Today is the day I start believing in the lizard overlords.” I believe the process is more nuanced than that. Just like the “SayNoToPeerPressure” acting troupe who’s dreams I got to watch die in the comfort of my high school gym, I’m about to push the idea of gateways. From my personal experience, the process of becoming involved in conspiracies looks a little something like this:\n \nMy point is that ideas that hinge on something already well established in society are easier to swallow than those that aren’t. That is not to say entirely new theories must be wrong, but rather that they are harder for people to immediately understand and they are also more likely to be too out there for the general population to get on board with. I think of parametric and non-parametric models in a very similar way to how people think of capitalism vs lizard people conspiracy theories.\n\nNon-Parametric Models Are Usually More Flexible, But Not Always\nParametric models construct our function by assuming its type, and then estimating the best model within this range. Non-parametric models do not make any assumptions about our model’s form, but rather try to fit to the general shape of the data. Parametric and Non-parametric does not directly translate to flexibility; they both have the potential to produce a very flexible or inflexible fit. For example, a constant polynomial and a K-NN model where K=N would both predict the average response (the most inflexible model we can get). Rather, just like dimensionality, non-parametric models can fall into the same pitfalls as flexibility, and so the limits of our dataset should be kept in mind. By their nature, non-parametric models are more susceptible to variance from changes in the sample, as the sample is the only thing the model is using to make its predictions. Therefore, they are more likely to overfitting than parametric models and are usually more difficult to interpret. These features mean that in general non-parametric models are more flexible, simply by their nature, however they are still have the potential to be inflexible."
},
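The constant-polynomial versus K-NN example in the entry above can be verified directly: a non-parametric K-NN model with K = N averages over every observation, so it returns the sample mean for every query, exactly the most inflexible fit possible. A minimal sketch (Python with numpy and scikit-learn; the data are invented):

```python
# Hypothetical check: K-NN with K = N averages over all observations, so it
# predicts the same value (the sample mean) no matter where we ask it to predict.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(2)
x = rng.uniform(0, 1, size=(30, 1))
y = 2 * x[:, 0] + rng.normal(0, 0.2, size=30)

knn_all = KNeighborsRegressor(n_neighbors=len(x)).fit(x, y)   # K = N
preds = knn_all.predict(np.array([[0.1], [0.5], [0.9]]))

print("K = N predictions:", preds.round(3))    # identical at every query point
print("sample mean:      ", round(float(y.mean()), 3))
```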
{
- "objectID": "posts/LDA/index.html",
- "href": "posts/LDA/index.html",
- "title": "A Future Public Disturbance Explains LDA",
- "section": "",
- "text": "I would like to say that I’m the type of person who would never complain at a restaurant, but deep down I know I’m one bad day away from being the woman in a viral video pissing on the floor of a Starbucks because they didn’t give her soy milk. If you would like to see the correlation that makes me think this, see the figure below:\n\n\n\nDespite this clear character flaw which I refuse to talk about further, I have never written an online review of a restaurant. I came close earlier this year when my sisters and I decided to spend money we didn’t have on some fine dining, thinking it would be a nice night out. Unsurprisingly from the tone of this post, the experience turned out to be a nightmare. When our waiter eventually asked if we were OK I looked at her with a manic glint in my eyes and told her I was silently praying that something from the ceiling would fall and crush me, so I could escape the eternal purgatory of being trapped in this restaurant. Spending all my money to get the opportunity to wait hours for an onslaught of terrible food, was probably an indicator I had already died and this was some personalized version of hell. Here is a snippet of a review I got halfway through writing, and then forgot about until this very moment.\n\n\n\nYou could consider my complaints to be wildly privilege and out of touch, but my dog died a year ago, and I would rather relive that than go to this restaurant again. If you are thinking to yourself that I seem to have unhinged emotional responses to slight upsets, you would be correct. So while we are on the topic of out of touch emotional responses, lets talk about Linear Discriminant Analysis (LDA).\nBack when I learnt about LDA, I had a regularly maintained page in my notes called “LDA is ruining my life”. Every tutorial for about 4 weeks would be me pointing to a section of the page and asking my tutor why all of it worked the way it did, and why I was too stupid to understand it. Ultimately my issue stemmed from one key question: Is LDA a classification or dimension reduction technique; and if its both, how are they related? I never figured it out, and after the exam, I decided it was something for people greater than myself to know. Or I decided it was not my problem any more. The distinction is unimportant. What is important is that a few weeks ago I overheard someone in the department talking about LDA and I had what I can only describe as machine learning based war flashbacks. So, thanks to this conversation, I reopened the page that hurt me to my soul, and made some plots to help me (and by extension you, the person reading this post) finally understand how LDA works. I’m going to break this down into two sections:\n\nHow LDA classification works in the 1-dimensional case\nHow LDA dimension reduction works in the 2-dimensional case and then extends to classification.\n\nFor the running example, we are going to look at some restaurant reviews to maintain the theme of “things Harriet has disproportionate emotional reactions to”. If anyone was looking for a sign that I’m running out of ideas, here it is."
+ "objectID": "posts/FlexiblevsInflexible/index.html#there-are-always-going-to-be-loonies-on-the-internet",
+ "href": "posts/FlexiblevsInflexible/index.html#there-are-always-going-to-be-loonies-on-the-internet",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "4: There are Always Going to Be Loonies on the Internet",
+ "text": "4: There are Always Going to Be Loonies on the Internet\nWe can all spend our entire lives trying to convince everyone on the internet that they are wrong, but at the end of the day, we live in a complicated world, with complicated people, and there are always going to be loonies on the internet. Rather than dreaming of a world where everyone knows everything all the time, the system should just be to manage the chaos. The important life skill to learn isn’t that everyone needs to be corrected, and to focus on the nutters, but rather enjoy the fact that the majority get most things right, most of the time. Socrates might disagree with my idea on majority votes but you win some, you lose some.\n\nYou Will Always Have Irreducible Error and It’s Size Matters\nObviously we can never have a perfect prediction since we are working with random variables. We can make our models more flexible to try and account for as much of the error as we can, but if we do, we might end up missing the underlying system entirely. No matter how flexible our model is, we will never have perfection thanks to our irreducible error (an attempt at making one is illustrated below). The interaction between flexibility and irreducible error comes from its size. A large irreducible error means the general shape change more drastically between samples, while a small one means our samples will remain consistent. Just like dimensionality, assumptions about our model, and sample size, this is just something that needs to be kept in mind as it has a strong interaction with the flexibility of our model, and the error from variance."
},
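The point above that no amount of flexibility removes the irreducible error can be put in numbers. The sketch below (Python with numpy; the curve and noise level are invented for illustration) scores the true function itself on noisy data and shows its error settling at the noise variance, the floor no model can beat.

```python
# Hypothetical illustration: even predictions from the *true* function have a
# test MSE of about sigma^2, the irreducible error, so extra flexibility past
# that point can only chase noise.
import numpy as np

rng = np.random.default_rng(3)
sigma = 0.5                                    # assumed standard deviation of the noise
x = rng.uniform(0, 1, size=10_000)
y = np.sin(2 * np.pi * x) + rng.normal(0, sigma, size=x.size)

perfect_preds = np.sin(2 * np.pi * x)          # the true underlying function, no model error at all
test_mse = np.mean((y - perfect_preds) ** 2)

print(f"noise variance (irreducible error): {sigma**2:.3f}")
print(f"test MSE of the true function:      {test_mse:.3f}")   # roughly the same value
```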
{
- "objectID": "posts/LDA/index.html#theory",
- "href": "posts/LDA/index.html#theory",
- "title": "A Future Public Disturbance Explains LDA",
- "section": "Theory",
- "text": "Theory\nLDA is pretty straight forward as far as classification models go. Every time we use a classification model, we are implicitly asking “Which of these groups is more probable?”, LDA just does this very literally. If you are unfamiliar with Bayes theorem (that would be alarming but I’m an adult who said “Who’s Cyprus?” to my housemate the other day so I can hardly judge) it looks like this:\nP(Y=k|X=x) = \\frac{P(K=k)P(X=x|K=k)}{P(X=x)}\nIf you don’t have an intuitive sense of Bayes theorem, its actually pretty easy to draw how it works. Lets say we have two groups, and we want to find the probability that an observation belongs to either Class 1 or Class 2 based on our predictors. Since there are two versions of the function (one for Class 1 and one for Class 2), we will have two different possible equations to plot, and so two different densities. To start with, LDA assumes that both classes are normally distributed and have the same variance, so we only need to calculate three things, each of the group means and their shared variance. Once we have these values we can draw the associated normal density, that is P(X=x|K=k) for each value of K.\n\n\n\nThis is already looking pretty good, but what if each class is not equally probable? Well, we can make these densities more accurate by scaling them by their relative class probability, i.e. P(K=k). So lets say that class 2 is much more likely than class 1, then we end up with this:\n\n\n\nThen to finish we just scale both of them down so that they follow the probability axioms. That is, we make sure the probability of belonging to Class 1 + the probability of belonging to Class 2 is not not greater than 1.\n\n\n\nWith that we end up at the equation for Bayes theorem. Unfortunately this theorem does not give us a classification rule, but rather relative probabilities. To make a rule we could calculate this probability for every class, and then classify our observation to whichever class spits out the largest value of this function, but that is a bit tedious. It would be much easier to have a single value where numbers above it are Class 2 and numbers below are Class 1. Lucky for us, the y-axis here gives a probability, which means for values of x where the Class 2 function is higher, Class 2 is more probable, and vice-versa for Class 1. Therefore, the best place to draw our boundary is when the densities overlap, and both classes are equally probable.\n\n\n\nThis the basics how LDA classifies observations. The only thing to note, is that it doesn’t do this on the scale of your X variable. It will scale X by some constant such that the classification bound drawn above is at 0, and this new scale is called the Linear Discriminant. To make this a little easier to understand, I can give a real example of 1-Dimensional LDA classification."
+ "objectID": "posts/FlexiblevsInflexible/index.html#to-conclude",
+ "href": "posts/FlexiblevsInflexible/index.html#to-conclude",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "To Conclude",
+ "text": "To Conclude\nDon’t let your mum hang out with weirdos, and treat conspiracy theories and overly complicated models with scepticism."
},
{
- "objectID": "posts/LDA/index.html#example",
- "href": "posts/LDA/index.html#example",
- "title": "A Future Public Disturbance Explains LDA",
- "section": "Example",
- "text": "Example\nWhile it would be in character for me to sit alone in my room and write hundreds of spiteful restaurant reviews to make a data set, I’m not going to do that. For this analysis I’m going to use a Kaggle data set with about 1000 restaurant reviews, where each observations has two variables; the typed review and a yes/no indicator for whether or not the person liked the food.\nUnfortunately if we want to predict if someone liked the food based off their review, the raw version of this data isn’t going to cut it. We need some comparable variables, which means we need a measure of how positive or negative the review is. For this, I’m going to keep it simple, and use a basic sentiment analysis. For those who don’t know, there are several ways to assess the sentiment of a word, i.e. if it has positive or negative feelings associated with it. There are a number of “lexicons” that are just massive data sets that have a word (such as “hate” or “love”) and an associated score for that words sentiment (-5 or +5). I’m going to use two of these for this example; ‘AFINN’ which gives words a score from -5 to 5, and ‘bing’ which rates words as either positive or negative (simplified to 1 or -1 here). To use these as variables in our example data set, I took the average sentiment of the words in the review for that lexicon. Finally we get a dataset with observations that look like this:\n\n\n\n\n\n\n\n\n\n\n\n\n\nReview\nliked\nafinn_sentiment\nbing_sentiment\n\n\n\n\n22\nThere is not a deal good enough that would drag me into that establishment again.\nNo\n0.1333\n0.0667\n\n\n34\nA great way to finish a great.\nYes\n0.8571\n0.2857\n\n\n94\nThis place should honestly be blown up.\nNo\n0.0000\n0.0000\n\n\n103\nTo my disbelief, each dish qualified as the worst version of these foods I have ever tasted.\nNo\n-0.1765\n-0.0588\n\n\n199\nSo they performed.\nYes\n0.0000\n0.0000\n\n\n\n\n\nNote: These reviews are some of my favourites, they are not a random sample.\nRight now, I’m not going to use both of these sentiments. We are still in the one dimensional classification case, so lets stick to the AFINN average. To see how this variable splits the reviews, lets make a density plot of the data similar to the one explained above.\n\n\n\n\n\n\n\n\n\nFrom this plot we can see that these distributions are neither normal, nor have similar variance. You will rarely have data that behaves exactly according to the assumptions. Does this mean we can’t use LDA? Not really, it just means that if (or when) we end up with a boundary that is a bit different to what you would draw yourself, it is probably because of these incorrect assumptions.\nIf we perform LDA on this data, we can find the coefficient of linear discriminants. Earlier I said that LDA will perform classification on its own scale, and this is how we find it. This value is used the same way it would be in a regression function, where it gives us the coefficients in the formula:\nx = 2.929\\times AverageSentiment_{AFINN} Where x is the “linear discriminant” for that review. We can plot this variable (although it is just afinn_sentiment scaled by a constant) with the decision bound at 0 to see what our linear discriminant space looks like:\n\n\n\n\n\n\n\n\n\nIt looks exactly the same except now the border for a 50% threshold is at 0. You may notice this is not where the densities overlap, and that is because LDA has assumed that distribution for “No” reviews is more spread out than it is. 
While it isn’t perfect, with this example under our belts we can move onto using LDA for dimension reduction."
+ "objectID": "posts/bootstrapCI/index.html",
+ "href": "posts/bootstrapCI/index.html",
+ "title": "My Idiot Brain Ruined My School’s NAPLAN average, But Bootstrapping Could Have Saved it",
+ "section": "",
+ "text": "Creative writing has never been my forte, however, no attempt was worse than my 5th Grade NAPLAN test. My score was so poor I suspect the examiners were concerned I would never turn out to be a functional adult capable of basic literacy. Unfortunately for my school, typical sample metrics like averages and standard deviation can be heavily skewed by a single student who thinks spelling is a waste of time, and writing out the plot of last nights fever dream makes for good literature. This issue for my school could have been fixed if, rather than using typical sample statistics, the NAPLAN examiners employed bootstrapping.\n\nWhat is Bootstrapping?\nBootstrapping is a versatile tool that can be used to find estimates of variance when they aren’t easily found analytically. This could be due to it being an uncommon statistic, or due to sampling from a strange distribution. Essentially, we sample from our dataset, with replacement, to make a new dataset that is the same size, as illustrated below. It works similarly to simulation, and understanding where simulation and bootstrapping diverge make the limitations and applications easier to understand.\n\n\n\n\n\nSimulation and Bootstrapping\nFor those of us in statistics, simulations are a useful tool and not just something your pseudo-intellectual friend brings up after watching a Neil deGrasse Tyson documentary. When an analytical solution is unavailable to us (or if the math isn’t worth it) a simple estimate with a simulation can go a long way. Bootstrapping is closely related to simulation and outlining their similarities and differences makes it easier to understand the main ideas of bootstrapping.\nIf our data creation process was forced into the context of a family tree, simulated data would be the parent of the bootstrap sample. A simulation of a simulation if you will. If we illustrate this, we need to start with our random process, the true data generating process that we assumes exists but can never truly know. This process creates our data (or simulation if we are working artificially) which turn creates our bootstrapped samples.\n\n\n\nFor a simulation, each observation is an independent trial generated from the true data generating process. As we generate more and more of these simulations, their behaviour (and thus statistics) will on average approach the true distribution. So we can average them to make estimates for the underlying process.\n\n\n\nBootstrapping is a little different but follows a similar idea. Instead of the source being a generating function, it is one of the simulations. To replicate the effect of independent trials, we sample with replacement from the data and generate a new dataset. This new dataset is essentially a simulation of that original simulation. Much in the way that simulation takes one step back and looks at the true relationship, bootstrapping takes one step back and can be used to estimate statistics in the data.\n\n\n\n\n\nThe Limitations of Bootstrapping\nBootstrapping is a simple trick that allows us to fabricate new samples from our data to understand the variance of our statistic. This process, however, only allows us to take one step back in the chain of simulation. Bootstrapping from a dataset only estimates the statistics from that dataset, going further and making estimates for the true underlying process would require additional natural data. 
More data through bootstrapping won’t cut it.\nHere you may throw up your hands at the futility of trying to estimate anything as you stare in the mirror and ask your refection why you chose statistics instead of following your childhood dream of running an ice cream shop. Lucky for us, data analysis evolved from mathematics with the key goal to estimate the impossible, using an age old technique called lowering our standards.\nReally the underlying data generating process is more of an idea than something you can touch. So the limitation isn’t that limiting. If you just change your goal to understanding the dataset, an endeavour that is more practical really, bootstrapping works just fine.\n\n\nEstimating Confidence Intervals\nHow can we use bootstrapping for something helpful, like estimating a confidence interval? Well, the process is surprisingly simple, and we have already done most of the work. After you have taken your bootstrapped samples and estimated your statistic of interest for each, you simply order them and take the values that lie on the percentile boundary. That’s it. You want a 90% confidence interval with 100 bootstrap samples? Well take the 100 bootstrapped sample statistics, order them, and the value range between the 5th and 95th observations is your 90% confidence interval. It’s that simple.\n\n\n\n\n\nUsing my Poor English Grades For an Example\nWhen I was a child, I did so badly in the year 5 NAPLAN writing that the school insisted on moving me from advanced, to remedial English. My paper, as my teacher explained to my mother, seemed to be a summarised plot of the anime “Full Metal Alchemist”, written in one sentence that spanned several pages, with no punctuation, and mostly incorrectly spelled words. This was coming in hot after my older brother, three years my senior, had also failed a large portion of his NAPLAN tests because he was “too busy thinking about his imaginary Pokémon” to answer the questions, and randomly filled in bubbles in the last 5 minutes. We are going to use my attack on the advanced class average as an example in bootstrapping.\nLets say the advanced class and the regular class both have 5 students each (they didn’t but drawing those stick people gets tiring). All these students have an associated test score, and the staff want to compare the two classes. Simply using the mean of the two classes will cause issues, since the mean is susceptible to be skewed by outliers, and my idiot paper is a big outlier.\n\n\n\n\n\n\nThe bootstrap comes to our aid here in two ways. First of all, finding the average of several bootstrapped samples can help with eliminating problems caused out outliers. In this case, since the sample size is only 5, that doesn’t quite work here, but with a sample size that isn’t limited by how many stick figures I can draw, it’s quite useful. Instead, we want to use the median to compare the two classes, since outliers don’t affect it. The problem with the median, is that unlike the mean, it doesn’t have any nice theorems that give us its variance. This is the second way bootstrapping can come to our aid, to create confidence intervals for our median.\nTo create each bootstrap sample, we will randomly sample our data with replacement 5 times. An example of what one of the bootstrapped samples would look like is shown below.\n\n\n\nIf we repeat this 100 times, we get 100 medians, which we can sort in ascending order, and get a confidence interval for the median. 
Using this method, we have managed to save my classmates from being dragged down by my terrible paper.\n\n\n\n\n\nA Final Note\nIn the end, bootstrapping is a useful and versatile tool, that can help us when we are using less conventional statistics or have an unconventional distribution. Unlike simulation, bootstrapping isn’t generating new data, but rather creating a new simulation from our current data, so the conclusions we can draw aren’t limitless. One place it could be useful, however, is saving the people around me from my moments of stupidity that drag them down to my nearly illiterate level.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
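The percentile recipe described in the entry above (order the bootstrapped medians, read off the 5th and 95th) fits in a few lines. A minimal sketch in Python with numpy, using made-up scores for the five students:

```python
# Hypothetical scores for the five advanced-class students, one dragged down by
# an outlier paper. Resample with replacement, take each resample's median,
# then read the 5th and 95th percentiles of those medians for a 90% interval.
import numpy as np

rng = np.random.default_rng(4)
scores = np.array([88, 91, 84, 90, 12])          # invented marks, one outlier
B = 100                                          # number of bootstrap samples

medians = np.array([
    np.median(rng.choice(scores, size=scores.size, replace=True))   # same-size resample
    for _ in range(B)
])
lower, upper = np.percentile(medians, [5, 95])   # 90% percentile interval
print(f"bootstrap 90% CI for the median: [{lower:.1f}, {upper:.1f}]")
```

With only five scores the interval is rough, but the same recipe scales directly to a class of any size.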
{
- "objectID": "posts/LDA/index.html#theory-1",
- "href": "posts/LDA/index.html#theory-1",
- "title": "A Future Public Disturbance Explains LDA",
- "section": "Theory",
- "text": "Theory\nNow we move onto the dimension reduction aspect of LDA. Remember how in the previous example LDA changed the original variable to make its own scale, LD1? Well, that is the dimension reduction part working, although in that case it wasn’t reducing the dimensionality, just scaling it. LDA dimension reduction is actually very similar to Principal Component Analysis (PCA) dimension reduction, in that it uses eigen-decomposition. To avoid going into some complicated details you can find elsewhere, whenever you read eigen-decomposition just think “we found direction of largest variance, represented it as a line, and projected data onto that line” and you know enough to make it through this post.\nIn my last post, I explained eigen-decomposition and how it is utilized in PCA, but it is much easier to understand when you to see it working. The animation below depicts a scatter plot of data, with a line that rotates around its centre. This line represents a potential direction for us to project our data onto (i.e. a 1-Dimensional representation). When we did PCA, we wanted to pick the line that has the highest variance, that is, had the fattest distribution when plotted as a density plot, and this animation lets us roughly see when that point is:\n\n \n\nThe point of this is visually show you the value of eigen decomposition. It gives an instant analytical result for what we could try to find with the animation above.\nHow is this relevant to LDA? Well it also uses an eigen decomposition, but it isn’t as straight forward as the PCA case. Now we have classes that we need to take into account. LDA decomposition works with the goal of minimising the overlap between classes. That is, if we plot the density of the two groups, we want the smallest possible overlap, as shown below.\n\n\n\nThere are two components of the distributions that come into play when picking this direction: 1) Distance of Class Means. The importance of this bit is pretty obvious. If the two groups are further apart then they have less overlap. Regardless of the features of the data, this will always be an important aspect. 1) Class Variance and Covariance. While having data that is far away is nice, if there is another projection that brings the class means closer, but makes the resulting distribution steeper, it will result in less overlap. Finding a balance between these two components is what allows us to maximise the distance between the two groups, and find the best 1-dimensional projection of the classes. The interaction of these two elements may not be easy to see above, but it will make a lot of sense with some animated examples. In these examples I have added a value called “VarDist” in the corner, I will get to the calculation of this value later, but for right now, know that it represents the interaction of these two components and we are looking to maximise it.\nTo start, lets take the variance and covariance of the data completely out of the equation and look at a simple case where our variables have no correlation and our classes are just two bivariate normals with different means.\n\n \n\nThe animation shows two distributions that change in central location, but their shape stays the same. Since the scatter plot shape of each group is essentially circular (due to the 0 correlation of the variables), no matter what direction we project the data into the shape (and therefore variance) will be the same. This means we can ignore the variance and focus on maximising the distance between the means. 
This is achieved by projecting the data on the line that goes through the two group averages. Moving on from this simple example, lets make things more interesting and look at some data where the variables are correlated.\n\n \n\nNow we can see two forces at play. Just as before, when the line goes through the two means the data is most separated, but this is no longer the only factor we need to consider. The positive correlation means that the direction we project the data onto can now also flatten or steepen the curve. We can no longer use the line that goes through the two means, because if another direction brings the distributions closer, but also significantly decreases the spread, that would be the preferable option. We can see this in example with negative correlation too.\n\n \n\nNow that we have seen how this works intuitively, we can go through how this is calculated. So, how does LDA perform a decomposition that accounts for these two competing elements? It combines two matrices in the eigen-decomposition, the variance covariance matrix and a matrix of the between group averages.\nFirst we want to minimise the within class variance of the projected data. The first important thing to note is that by the assumptions of LDA, all the classes have identical variance-covariance matrices. Therefore to calculate the matrix, we get the variance-covariance matrix of each isolated class, and then average them. The averaging shouldn’t change the values (if your LDA assumptions are correct), it should just makes the estimate more accurate. This is illustrated in the picture below.\n\n\n\nNow that we have this matrix, how do we find the projection that minimises the variance instead of maximises? We just perform an eigen-decomposition on the inverse of the matrix. Now that we have taken care of the spread element of LDA, we can take care of the “separating the means” element. For this we create another matrix for the “between group differences”.\nThis is just a matrix representation of the distance between the classes which is constructed using the method illustrated below. Much like with the eigen-decomposition of the variance-covariance matrix, how this matrix works is not of major significance.\n\n\n\nSince we are trying to maximise this, we do the eigen decomposition on the matrix. Finally, to get the direction we are projecting our data onto, we need to just take the eigen decomposition of the combination of these two matrices, that is the matrix \\Sigma_V^{-1}\\Sigma_B.\nIn this case, what is the “VarDist” value that appears in the plots?. Well, when you do an eigen-decomposition it analytically finds the direction that maximises the distance between groups, but instead of solving it analytically, we could also solve it iteratively and just check the product of the group variance and between group difference in the 1 dimensional projection for a series of projections and select the one that maximises this value. That is the value that is shown in the corner of the animated plots and its calculation is shown in the formula below:\nVarDist=({\\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{n_1+n_2}})^{-1} \\times \\sum_{i=1}^2(\\bar{x_k}-\\bar{x})^2\nWith this information, it should be clear how we get the 1-dimensional representation that best separates the two classes. While a technical understanding is fine, it is easier to see this come together with an example."
+ "objectID": "posts/hackathon_2020/index.html#overview",
+ "href": "posts/hackathon_2020/index.html#overview",
+ "title": "NUMBAT Hackathon 2020",
+ "section": "Overview",
+ "text": "Overview\nThe second ever NUMBAT hackathon was held at Abbotsford Convent Feb 19-20, 2020. A hackathon is style like RopenSci events where attendees brainstorm potential projects, and join to gether in small groups to tackle a subset of these."
},
{
- "objectID": "posts/LDA/index.html#example-1",
- "href": "posts/LDA/index.html#example-1",
- "title": "A Future Public Disturbance Explains LDA",
- "section": "Example",
- "text": "Example\nFollowing on from our example before, let’s add in a second variable, the bing sentiment measure. Now that we have two variables we can plot them together on a scatterplot to see their relation.\n\n\n\n\n\n\n\n\n\nThis plot gives us a few things to note. First of all, this is clearly not two bivariate normal densities, as LDA will assume. Again, this just means our results will be slightly off. In addition to this there are a lot of 0 values for both the AFINN and bing sentiments. This can occur in two circumstances: 1) none of the words in the review appear in that lexicon because they are all neutral (“I’m uncertain I could call that food”), or 2) if the sentiment scores of the words in the review cancel each other out (e.g. “The food was so good I felt bad eating it”). Since this only impacts the assumptions of LDA we are going to power through.\nJust like with the classification, we can perform LDA on this dataset and get a formula that calculates the Linear Discriminant for each review. In this example our coefficients give a function that looks like this:\n x = 0.800\\times AverageSentiment_{AFINN} + 5.576\\times Average Sentiment_{bing} Now, instead of the linear discriminant just scaling the variables so the 50/50 probability split is at 0, it is a linear combination of the two variables. How did it get these coefficients? Using the eigen-decomposition described above. Again, we can calculate the linear discriminant of each review, and in doing so, have a 1-dimensional projection of the data. Now that we have two variables, just like with PCA we can interpret their coefficient values. If the values are on the same scale (or you scaled them prior to performing your analysis) then this is a simple step of comparison. Unfortunately I did not do that (I wanted it to be clear which sentiment was which by the scale or I was being lazy, you pick) so instead we can plot it on the previous scatter plot and comment on the steepness of the slope. Below is a plot of the data with the line it is projected onto, as well as the resulting density.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSince the slope is not steep, we can see that AFINN sentiment contributes more to the dimension reduction than bing does, and is therefore more important when it comes to separating the groups. This is how we interpret the dimension reduction. You can see that I added the decision boundary in too. Once we have a single dimension that our data exist on, the classification is exactly as it was above, drawing the line where the two densities overlap (or not quite when we violate the assumptions)."
+ "objectID": "posts/hackathon_2020/index.html#projects",
+ "href": "posts/hackathon_2020/index.html#projects",
+ "title": "NUMBAT Hackathon 2020",
+ "section": "Projects",
+ "text": "Projects\nPlanned projects can be found at https://github.com/numbats/numbathackathon/issues. (In closed issues, too.)"
},
{
"objectID": "posts/PopulationvsSample/index.html",
@@ -539,354 +567,333 @@
"text": "In Conclusion\nBoth statisticians and legal prosecutors make mistakes all the time, that being said, sometimes they were avoidable. If we simply take a step back, and assessed the limits of our conclusions we could stop many overconfident predictions, and wrongly accused mothers. Before doing any kind of analysis there are a handful of questions we should ask ourselves. What is my data a sample of? Is my sample large enough to capture what I’m investigating? Is my model making ridiculous assumptions about my variables? Is murder REALLY the most likely reason for a fragile baby to die? Ok, maybe less of the last question for statistics, but the other three for sure."
},
{
- "objectID": "posts/hackathon_2020/index.html#overview",
- "href": "posts/hackathon_2020/index.html#overview",
- "title": "NUMBAT Hackathon 2020",
- "section": "Overview",
- "text": "Overview\nThe second ever NUMBAT hackathon was held at Abbotsford Convent Feb 19-20, 2020. A hackathon is style like RopenSci events where attendees brainstorm potential projects, and join to gether in small groups to tackle a subset of these."
+ "objectID": "posts/LDA/index.html",
+ "href": "posts/LDA/index.html",
+ "title": "A Future Public Disturbance Explains LDA",
+ "section": "",
+ "text": "I would like to say that I’m the type of person who would never complain at a restaurant, but deep down I know I’m one bad day away from being the woman in a viral video pissing on the floor of a Starbucks because they didn’t give her soy milk. If you would like to see the correlation that makes me think this, see the figure below:\n\n\n\nDespite this clear character flaw which I refuse to talk about further, I have never written an online review of a restaurant. I came close earlier this year when my sisters and I decided to spend money we didn’t have on some fine dining, thinking it would be a nice night out. Unsurprisingly from the tone of this post, the experience turned out to be a nightmare. When our waiter eventually asked if we were OK I looked at her with a manic glint in my eyes and told her I was silently praying that something from the ceiling would fall and crush me, so I could escape the eternal purgatory of being trapped in this restaurant. Spending all my money to get the opportunity to wait hours for an onslaught of terrible food, was probably an indicator I had already died and this was some personalized version of hell. Here is a snippet of a review I got halfway through writing, and then forgot about until this very moment.\n\n\n\nYou could consider my complaints to be wildly privilege and out of touch, but my dog died a year ago, and I would rather relive that than go to this restaurant again. If you are thinking to yourself that I seem to have unhinged emotional responses to slight upsets, you would be correct. So while we are on the topic of out of touch emotional responses, lets talk about Linear Discriminant Analysis (LDA).\nBack when I learnt about LDA, I had a regularly maintained page in my notes called “LDA is ruining my life”. Every tutorial for about 4 weeks would be me pointing to a section of the page and asking my tutor why all of it worked the way it did, and why I was too stupid to understand it. Ultimately my issue stemmed from one key question: Is LDA a classification or dimension reduction technique; and if its both, how are they related? I never figured it out, and after the exam, I decided it was something for people greater than myself to know. Or I decided it was not my problem any more. The distinction is unimportant. What is important is that a few weeks ago I overheard someone in the department talking about LDA and I had what I can only describe as machine learning based war flashbacks. So, thanks to this conversation, I reopened the page that hurt me to my soul, and made some plots to help me (and by extension you, the person reading this post) finally understand how LDA works. I’m going to break this down into two sections:\n\nHow LDA classification works in the 1-dimensional case\nHow LDA dimension reduction works in the 2-dimensional case and then extends to classification.\n\nFor the running example, we are going to look at some restaurant reviews to maintain the theme of “things Harriet has disproportionate emotional reactions to”. If anyone was looking for a sign that I’m running out of ideas, here it is."
},
{
- "objectID": "posts/hackathon_2020/index.html#projects",
- "href": "posts/hackathon_2020/index.html#projects",
- "title": "NUMBAT Hackathon 2020",
- "section": "Projects",
- "text": "Projects\nPlanned projects can be found at https://github.com/numbats/numbathackathon/issues. (In closed issues, too.)"
+ "objectID": "posts/LDA/index.html#theory",
+ "href": "posts/LDA/index.html#theory",
+ "title": "A Future Public Disturbance Explains LDA",
+ "section": "Theory",
+ "text": "Theory\nLDA is pretty straight forward as far as classification models go. Every time we use a classification model, we are implicitly asking “Which of these groups is more probable?”, LDA just does this very literally. If you are unfamiliar with Bayes theorem (that would be alarming but I’m an adult who said “Who’s Cyprus?” to my housemate the other day so I can hardly judge) it looks like this:\nP(Y=k|X=x) = \\frac{P(K=k)P(X=x|K=k)}{P(X=x)}\nIf you don’t have an intuitive sense of Bayes theorem, its actually pretty easy to draw how it works. Lets say we have two groups, and we want to find the probability that an observation belongs to either Class 1 or Class 2 based on our predictors. Since there are two versions of the function (one for Class 1 and one for Class 2), we will have two different possible equations to plot, and so two different densities. To start with, LDA assumes that both classes are normally distributed and have the same variance, so we only need to calculate three things, each of the group means and their shared variance. Once we have these values we can draw the associated normal density, that is P(X=x|K=k) for each value of K.\n\n\n\nThis is already looking pretty good, but what if each class is not equally probable? Well, we can make these densities more accurate by scaling them by their relative class probability, i.e. P(K=k). So lets say that class 2 is much more likely than class 1, then we end up with this:\n\n\n\nThen to finish we just scale both of them down so that they follow the probability axioms. That is, we make sure the probability of belonging to Class 1 + the probability of belonging to Class 2 is not not greater than 1.\n\n\n\nWith that we end up at the equation for Bayes theorem. Unfortunately this theorem does not give us a classification rule, but rather relative probabilities. To make a rule we could calculate this probability for every class, and then classify our observation to whichever class spits out the largest value of this function, but that is a bit tedious. It would be much easier to have a single value where numbers above it are Class 2 and numbers below are Class 1. Lucky for us, the y-axis here gives a probability, which means for values of x where the Class 2 function is higher, Class 2 is more probable, and vice-versa for Class 1. Therefore, the best place to draw our boundary is when the densities overlap, and both classes are equally probable.\n\n\n\nThis the basics how LDA classifies observations. The only thing to note, is that it doesn’t do this on the scale of your X variable. It will scale X by some constant such that the classification bound drawn above is at 0, and this new scale is called the Linear Discriminant. To make this a little easier to understand, I can give a real example of 1-Dimensional LDA classification."
},
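The scaled-densities picture described in the entry above can be reproduced numerically. Below is a small sketch (Python with numpy and scipy; the means, shared variance, and priors are invented numbers) that finds the point where the two prior-weighted normal densities are equal, which is where the entry says the boundary belongs.

```python
# Hypothetical two-class setup: normal densities with a shared variance,
# weighted by prior class probabilities P(K=k). The decision boundary is the
# x where the two weighted densities are equal (the classes are equally probable).
import numpy as np
from scipy.stats import norm
from scipy.optimize import brentq

mu1, mu2, sigma = -1.0, 2.0, 1.0      # class means and shared standard deviation (assumed)
p1, p2 = 0.3, 0.7                     # prior class probabilities (assumed)

def weighted_density_gap(x):
    # P(K=1) P(X=x | K=1) - P(K=2) P(X=x | K=2); zero where both classes are equally probable
    return p1 * norm.pdf(x, mu1, sigma) - p2 * norm.pdf(x, mu2, sigma)

boundary = brentq(weighted_density_gap, mu1, mu2)   # the sign changes between the two means
print(f"decision boundary: x = {boundary:.3f}")

# classify a new observation by checking which side of the gap it falls on
x_new = 0.2
print("class 2" if weighted_density_gap(x_new) < 0 else "class 1")
```

Because class 2 has the larger prior, the boundary lands below the halfway point between the means, shifted toward class 1, matching the scaling step described in the entry.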
{
- "objectID": "posts/bootstrapCI/index.html",
- "href": "posts/bootstrapCI/index.html",
- "title": "My Idiot Brain Ruined My School’s NAPLAN average, But Bootstrapping Could Have Saved it",
- "section": "",
- "text": "Creative writing has never been my forte, however, no attempt was worse than my 5th Grade NAPLAN test. My score was so poor I suspect the examiners were concerned I would never turn out to be a functional adult capable of basic literacy. Unfortunately for my school, typical sample metrics like averages and standard deviation can be heavily skewed by a single student who thinks spelling is a waste of time, and writing out the plot of last nights fever dream makes for good literature. This issue for my school could have been fixed if, rather than using typical sample statistics, the NAPLAN examiners employed bootstrapping.\n\nWhat is Bootstrapping?\nBootstrapping is a versatile tool that can be used to find estimates of variance when they aren’t easily found analytically. This could be due to it being an uncommon statistic, or due to sampling from a strange distribution. Essentially, we sample from our dataset, with replacement, to make a new dataset that is the same size, as illustrated below. It works similarly to simulation, and understanding where simulation and bootstrapping diverge make the limitations and applications easier to understand.\n\n\n\n\n\nSimulation and Bootstrapping\nFor those of us in statistics, simulations are a useful tool and not just something your pseudo-intellectual friend brings up after watching a Neil deGrasse Tyson documentary. When an analytical solution is unavailable to us (or if the math isn’t worth it) a simple estimate with a simulation can go a long way. Bootstrapping is closely related to simulation and outlining their similarities and differences makes it easier to understand the main ideas of bootstrapping.\nIf our data creation process was forced into the context of a family tree, simulated data would be the parent of the bootstrap sample. A simulation of a simulation if you will. If we illustrate this, we need to start with our random process, the true data generating process that we assumes exists but can never truly know. This process creates our data (or simulation if we are working artificially) which turn creates our bootstrapped samples.\n\n\n\nFor a simulation, each observation is an independent trial generated from the true data generating process. As we generate more and more of these simulations, their behaviour (and thus statistics) will on average approach the true distribution. So we can average them to make estimates for the underlying process.\n\n\n\nBootstrapping is a little different but follows a similar idea. Instead of the source being a generating function, it is one of the simulations. To replicate the effect of independent trials, we sample with replacement from the data and generate a new dataset. This new dataset is essentially a simulation of that original simulation. Much in the way that simulation takes one step back and looks at the true relationship, bootstrapping takes one step back and can be used to estimate statistics in the data.\n\n\n\n\n\nThe Limitations of Bootstrapping\nBootstrapping is a simple trick that allows us to fabricate new samples from our data to understand the variance of our statistic. This process, however, only allows us to take one step back in the chain of simulation. Bootstrapping from a dataset only estimates the statistics from that dataset, going further and making estimates for the true underlying process would require additional natural data. 
More data through bootstrapping won’t cut it.\nHere you may throw up your hands at the futility of trying to estimate anything as you stare in the mirror and ask your refection why you chose statistics instead of following your childhood dream of running an ice cream shop. Lucky for us, data analysis evolved from mathematics with the key goal to estimate the impossible, using an age old technique called lowering our standards.\nReally the underlying data generating process is more of an idea than something you can touch. So the limitation isn’t that limiting. If you just change your goal to understanding the dataset, an endeavour that is more practical really, bootstrapping works just fine.\n\n\nEstimating Confidence Intervals\nHow can we use bootstrapping for something helpful, like estimating a confidence interval? Well, the process is surprisingly simple, and we have already done most of the work. After you have taken your bootstrapped samples and estimated your statistic of interest for each, you simply order them and take the values that lie on the percentile boundary. That’s it. You want a 90% confidence interval with 100 bootstrap samples? Well take the 100 bootstrapped sample statistics, order them, and the value range between the 5th and 95th observations is your 90% confidence interval. It’s that simple.\n\n\n\n\n\nUsing my Poor English Grades For an Example\nWhen I was a child, I did so badly in the year 5 NAPLAN writing that the school insisted on moving me from advanced, to remedial English. My paper, as my teacher explained to my mother, seemed to be a summarised plot of the anime “Full Metal Alchemist”, written in one sentence that spanned several pages, with no punctuation, and mostly incorrectly spelled words. This was coming in hot after my older brother, three years my senior, had also failed a large portion of his NAPLAN tests because he was “too busy thinking about his imaginary Pokémon” to answer the questions, and randomly filled in bubbles in the last 5 minutes. We are going to use my attack on the advanced class average as an example in bootstrapping.\nLets say the advanced class and the regular class both have 5 students each (they didn’t but drawing those stick people gets tiring). All these students have an associated test score, and the staff want to compare the two classes. Simply using the mean of the two classes will cause issues, since the mean is susceptible to be skewed by outliers, and my idiot paper is a big outlier.\n\n\n\n\n\n\nThe bootstrap comes to our aid here in two ways. First of all, finding the average of several bootstrapped samples can help with eliminating problems caused out outliers. In this case, since the sample size is only 5, that doesn’t quite work here, but with a sample size that isn’t limited by how many stick figures I can draw, it’s quite useful. Instead, we want to use the median to compare the two classes, since outliers don’t affect it. The problem with the median, is that unlike the mean, it doesn’t have any nice theorems that give us its variance. This is the second way bootstrapping can come to our aid, to create confidence intervals for our median.\nTo create each bootstrap sample, we will randomly sample our data with replacement 5 times. An example of what one of the bootstrapped samples would look like is shown below.\n\n\n\nIf we repeat this 100 times, we get 100 medians, which we can sort in ascending order, and get a confidence interval for the median. 
Using this method, we have managed to save my classmates from being dragged down by my terrible paper.\n\n\n\n\n\nA Final Note\nIn the end, bootstrapping is a useful and versatile tool, that can help us when we are using less conventional statistics or have an unconventional distribution. Unlike simulation, bootstrapping isn’t generating new data, but rather creating a new simulation from our current data, so the conclusions we can draw aren’t limitless. One place it could be useful, however, is saving the people around me from my moments of stupidity that drag them down to my nearly illiterate level.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
+ "objectID": "posts/LDA/index.html#example",
+ "href": "posts/LDA/index.html#example",
+ "title": "A Future Public Disturbance Explains LDA",
+ "section": "Example",
+ "text": "Example\nWhile it would be in character for me to sit alone in my room and write hundreds of spiteful restaurant reviews to make a data set, I’m not going to do that. For this analysis I’m going to use a Kaggle data set with about 1000 restaurant reviews, where each observations has two variables; the typed review and a yes/no indicator for whether or not the person liked the food.\nUnfortunately if we want to predict if someone liked the food based off their review, the raw version of this data isn’t going to cut it. We need some comparable variables, which means we need a measure of how positive or negative the review is. For this, I’m going to keep it simple, and use a basic sentiment analysis. For those who don’t know, there are several ways to assess the sentiment of a word, i.e. if it has positive or negative feelings associated with it. There are a number of “lexicons” that are just massive data sets that have a word (such as “hate” or “love”) and an associated score for that words sentiment (-5 or +5). I’m going to use two of these for this example; ‘AFINN’ which gives words a score from -5 to 5, and ‘bing’ which rates words as either positive or negative (simplified to 1 or -1 here). To use these as variables in our example data set, I took the average sentiment of the words in the review for that lexicon. Finally we get a dataset with observations that look like this:\n\n\n\n\n\n\n\n\n\n\n\n\n\nReview\nliked\nafinn_sentiment\nbing_sentiment\n\n\n\n\n22\nThere is not a deal good enough that would drag me into that establishment again.\nNo\n0.1333\n0.0667\n\n\n34\nA great way to finish a great.\nYes\n0.8571\n0.2857\n\n\n94\nThis place should honestly be blown up.\nNo\n0.0000\n0.0000\n\n\n103\nTo my disbelief, each dish qualified as the worst version of these foods I have ever tasted.\nNo\n-0.1765\n-0.0588\n\n\n199\nSo they performed.\nYes\n0.0000\n0.0000\n\n\n\n\n\nNote: These reviews are some of my favourites, they are not a random sample.\nRight now, I’m not going to use both of these sentiments. We are still in the one dimensional classification case, so lets stick to the AFINN average. To see how this variable splits the reviews, lets make a density plot of the data similar to the one explained above.\n\n\n\n\n\n\n\n\n\nFrom this plot we can see that these distributions are neither normal, nor have similar variance. You will rarely have data that behaves exactly according to the assumptions. Does this mean we can’t use LDA? Not really, it just means that if (or when) we end up with a boundary that is a bit different to what you would draw yourself, it is probably because of these incorrect assumptions.\nIf we perform LDA on this data, we can find the coefficient of linear discriminants. Earlier I said that LDA will perform classification on its own scale, and this is how we find it. This value is used the same way it would be in a regression function, where it gives us the coefficients in the formula:\nx = 2.929\\times AverageSentiment_{AFINN} Where x is the “linear discriminant” for that review. We can plot this variable (although it is just afinn_sentiment scaled by a constant) with the decision bound at 0 to see what our linear discriminant space looks like:\n\n\n\n\n\n\n\n\n\nIt looks exactly the same except now the border for a 50% threshold is at 0. You may notice this is not where the densities overlap, and that is because LDA has assumed that distribution for “No” reviews is more spread out than it is. 
While it isn’t perfect, with this example under our belts we can move onto using LDA for dimension reduction."
},
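The average-sentiment feature described in the entry above is simple enough to sketch directly. Below is a toy Python version (the mini-lexicon and its scores are invented stand-ins; the post uses the full AFINN and bing lexicons) that averages word scores over every word in a review and then scales the result by the post's reported coefficient of 2.929; with "great" scored at +3 it reproduces the 0.8571 shown for review 34.

```python
# Toy lexicon standing in for AFINN; only a handful of invented word scores.
toy_afinn = {"great": 3, "good": 3, "worst": -3, "terrible": -3, "love": 3, "hate": -3}

def average_sentiment(review: str, lexicon: dict) -> float:
    # strip simple punctuation from each word; words missing from the lexicon score 0
    words = [w.strip(".,!?") for w in review.lower().split()]
    if not words:
        return 0.0
    return sum(lexicon.get(w, 0) for w in words) / len(words)

review = "A great way to finish a great."
feature = average_sentiment(review, toy_afinn)
ld_score = 2.929 * feature                     # x = 2.929 * AverageSentiment_AFINN, per the post
print(f"average sentiment  = {feature:.4f}")   # 0.8571 with these toy scores
print(f"linear discriminant = {ld_score:.4f}")
```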
{
- "objectID": "posts/FlexiblevsInflexible/index.html",
- "href": "posts/FlexiblevsInflexible/index.html",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "",
- "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, at risk to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
+ "objectID": "posts/LDA/index.html#theory-1",
+ "href": "posts/LDA/index.html#theory-1",
+ "title": "A Future Public Disturbance Explains LDA",
+ "section": "Theory",
+ "text": "Theory\nNow we move onto the dimension reduction aspect of LDA. Remember how in the previous example LDA changed the original variable to make its own scale, LD1? Well, that is the dimension reduction part working, although in that case it wasn’t reducing the dimensionality, just scaling it. LDA dimension reduction is actually very similar to Principal Component Analysis (PCA) dimension reduction, in that it uses eigen-decomposition. To avoid going into some complicated details you can find elsewhere, whenever you read eigen-decomposition just think “we found direction of largest variance, represented it as a line, and projected data onto that line” and you know enough to make it through this post.\nIn my last post, I explained eigen-decomposition and how it is utilized in PCA, but it is much easier to understand when you to see it working. The animation below depicts a scatter plot of data, with a line that rotates around its centre. This line represents a potential direction for us to project our data onto (i.e. a 1-Dimensional representation). When we did PCA, we wanted to pick the line that has the highest variance, that is, had the fattest distribution when plotted as a density plot, and this animation lets us roughly see when that point is:\n\n \n\nThe point of this is visually show you the value of eigen decomposition. It gives an instant analytical result for what we could try to find with the animation above.\nHow is this relevant to LDA? Well it also uses an eigen decomposition, but it isn’t as straight forward as the PCA case. Now we have classes that we need to take into account. LDA decomposition works with the goal of minimising the overlap between classes. That is, if we plot the density of the two groups, we want the smallest possible overlap, as shown below.\n\n\n\nThere are two components of the distributions that come into play when picking this direction: 1) Distance of Class Means. The importance of this bit is pretty obvious. If the two groups are further apart then they have less overlap. Regardless of the features of the data, this will always be an important aspect. 1) Class Variance and Covariance. While having data that is far away is nice, if there is another projection that brings the class means closer, but makes the resulting distribution steeper, it will result in less overlap. Finding a balance between these two components is what allows us to maximise the distance between the two groups, and find the best 1-dimensional projection of the classes. The interaction of these two elements may not be easy to see above, but it will make a lot of sense with some animated examples. In these examples I have added a value called “VarDist” in the corner, I will get to the calculation of this value later, but for right now, know that it represents the interaction of these two components and we are looking to maximise it.\nTo start, lets take the variance and covariance of the data completely out of the equation and look at a simple case where our variables have no correlation and our classes are just two bivariate normals with different means.\n\n \n\nThe animation shows two distributions that change in central location, but their shape stays the same. Since the scatter plot shape of each group is essentially circular (due to the 0 correlation of the variables), no matter what direction we project the data into the shape (and therefore variance) will be the same. This means we can ignore the variance and focus on maximising the distance between the means. 
This is achieved by projecting the data onto the line that goes through the two group averages. Moving on from this simple example, let’s make things more interesting and look at some data where the variables are correlated.\n\n \n\nNow we can see two forces at play. Just as before, when the line goes through the two means the data is most separated, but this is no longer the only factor we need to consider. The positive correlation means that the direction we project the data onto can now also flatten or steepen the curve. We can no longer just use the line that goes through the two means, because if another direction brings the distributions closer, but also significantly decreases the spread, that would be the preferable option. We can see this in an example with negative correlation too.\n\n \n\nNow that we have seen how this works intuitively, we can go through how this is calculated. So, how does LDA perform a decomposition that accounts for these two competing elements? It combines two matrices in the eigen-decomposition: the variance-covariance matrix and a matrix of the between-group differences.\nFirst we want to minimise the within-class variance of the projected data. The first important thing to note is that by the assumptions of LDA, all the classes have identical variance-covariance matrices. Therefore to calculate the matrix, we get the variance-covariance matrix of each isolated class, and then average them. The averaging shouldn’t change the values (if your LDA assumptions are correct), it should just make the estimate more accurate. This is illustrated in the picture below.\n\n\n\nNow that we have this matrix, how do we find the projection that minimises the variance instead of maximising it? We just perform an eigen-decomposition on the inverse of the matrix. Now that we have taken care of the spread element of LDA, we can take care of the “separating the means” element. For this we create another matrix for the “between group differences”.\nThis is just a matrix representation of the distance between the classes, which is constructed using the method illustrated below. Much like with the eigen-decomposition of the variance-covariance matrix, how this matrix works is not of major significance.\n\n\n\nSince we are trying to maximise this, we do the eigen-decomposition on the matrix itself rather than its inverse. Finally, to get the direction we are projecting our data onto, we just take the eigen-decomposition of the product of these two matrices, that is, the matrix \Sigma_V^{-1}\Sigma_B.\nIn this case, what is the “VarDist” value that appears in the plots? Well, the eigen-decomposition analytically finds the direction that best separates the groups, but instead of solving it analytically, we could also solve it iteratively: for a series of candidate projections, compute the ratio of the between-group difference to the pooled within-group variance in the 1-dimensional projection, and select the projection that maximises this value. That is the value shown in the corner of the animated plots, and its calculation is shown in the formula below:\nVarDist=({\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{n_1+n_2}})^{-1} \times \sum_{k=1}^2(\bar{x}_k-\bar{x})^2\nWith this information, it should be clear how we get the 1-dimensional representation that best separates the two classes. While a technical understanding is fine, it is easier to see this come together with an example."
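As a rough illustration of the recipe just described (a sketch on simulated two-class data, not the code behind the animations above), the whole decomposition fits in a few lines of R: average the class covariance matrices, build the between-group matrix from the class means, and take the leading eigenvector of \Sigma_V^{-1}\Sigma_B.

```r
library(MASS)   # mvrnorm() to simulate two correlated classes

set.seed(1)
n  <- 200
Sigma <- matrix(c(1, 0.6, 0.6, 1), 2)
X1 <- mvrnorm(n, mu = c(0, 0), Sigma = Sigma)
X2 <- mvrnorm(n, mu = c(2, 1), Sigma = Sigma)
X  <- rbind(X1, X2)

# Within-class variance-covariance matrix: the average of the class covariances
Sw <- (cov(X1) + cov(X2)) / 2

# Between-group matrix: built from the deviations of the class means from the grand mean
grand <- colMeans(X)
Sb <- tcrossprod(colMeans(X1) - grand) + tcrossprod(colMeans(X2) - grand)

# The projection direction is the leading eigenvector of solve(Sw) %*% Sb
w <- Re(eigen(solve(Sw) %*% Sb)$vectors[, 1])   # Re() guards against numerical noise
w

# Projecting onto w gives the 1-dimensional representation that best separates the classes
ld1 <- X %*% w
```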
},
{
- "objectID": "posts/FlexiblevsInflexible/index.html#a-conspiracy-theory-is-like-a-bad-model",
- "href": "posts/FlexiblevsInflexible/index.html#a-conspiracy-theory-is-like-a-bad-model",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "",
- "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, at risk to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
+ "objectID": "posts/LDA/index.html#example-1",
+ "href": "posts/LDA/index.html#example-1",
+ "title": "A Future Public Disturbance Explains LDA",
+ "section": "Example",
+ "text": "Example\nFollowing on from our example before, let’s add in a second variable, the bing sentiment measure. Now that we have two variables we can plot them together on a scatterplot to see their relation.\n\n\n\n\n\n\n\n\n\nThis plot gives us a few things to note. First of all, this is clearly not two bivariate normal densities, as LDA will assume. Again, this just means our results will be slightly off. In addition to this there are a lot of 0 values for both the AFINN and bing sentiments. This can occur in two circumstances: 1) none of the words in the review appear in that lexicon because they are all neutral (“I’m uncertain I could call that food”), or 2) if the sentiment scores of the words in the review cancel each other out (e.g. “The food was so good I felt bad eating it”). Since this only impacts the assumptions of LDA we are going to power through.\nJust like with the classification, we can perform LDA on this dataset and get a formula that calculates the Linear Discriminant for each review. In this example our coefficients give a function that looks like this:\n x = 0.800\\times AverageSentiment_{AFINN} + 5.576\\times Average Sentiment_{bing} Now, instead of the linear discriminant just scaling the variables so the 50/50 probability split is at 0, it is a linear combination of the two variables. How did it get these coefficients? Using the eigen-decomposition described above. Again, we can calculate the linear discriminant of each review, and in doing so, have a 1-dimensional projection of the data. Now that we have two variables, just like with PCA we can interpret their coefficient values. If the values are on the same scale (or you scaled them prior to performing your analysis) then this is a simple step of comparison. Unfortunately I did not do that (I wanted it to be clear which sentiment was which by the scale or I was being lazy, you pick) so instead we can plot it on the previous scatter plot and comment on the steepness of the slope. Below is a plot of the data with the line it is projected onto, as well as the resulting density.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSince the slope is not steep, we can see that AFINN sentiment contributes more to the dimension reduction than bing does, and is therefore more important when it comes to separating the groups. This is how we interpret the dimension reduction. You can see that I added the decision boundary in too. Once we have a single dimension that our data exist on, the classification is exactly as it was above, drawing the line where the two densities overlap (or not quite when we violate the assumptions)."
},
{
- "objectID": "posts/FlexiblevsInflexible/index.html#outrageous-claims-need-outrageous-evidence",
- "href": "posts/FlexiblevsInflexible/index.html#outrageous-claims-need-outrageous-evidence",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "1: Outrageous Claims Need Outrageous Evidence",
- "text": "1: Outrageous Claims Need Outrageous Evidence\nMy mother is a “bit eccentric” to put it mildly. In the last few months, to only name a few things, she has bought a fire truck to start mud-crabbing (pictured below), bought some goats because the garden is a pain to manage, and turned the pool into a “fish Club Med” where she collects wildlife from the local creek and feeds them McDonalds for breakfast. From expulsions to arrest warrants, to the man she drank goon with at the beach who now lives in our house, the stories are endless. Despite this, never in my life had I ever been called a liar for telling them (the first time was at university orientation). People at my school had grown used to it, they had met my family and heard years worth of stories so I had a wealth of evidence to normalise my claims. Strangers didn’t have that, and so they didn’t believe my outrageous (completely true) tales. Similarly in statistics, if we want a complicated model we will need a large sample size to back it up.\n \n\nWhy Flexible Models Need a Bigger Sample\nIn general, the larger your sample size, the more likely it is you have captured the “true relationship”. If you are increasing the number of parameters to estimate (not literally for non-parametric models but the idea carries on) without increasing the sample size, we are in effect decreasing the “sample size” for each of the estimated values, and thus decreasing the reliability of our model. Placing more weight on all the observations in calculating our estimates, means we are increasing the influence of outliers and unrepresentative samples. We can either have observations contributing to a large area but averaged over many observations, or over a small area where our estimates are averages over fewer observations. For example, if we have 10 observations and predict using the average, each observation contributes to 1/10th of the prediction, if we use 1-Nearest Neighbour, each prediction is only backed up by a single observation (illustrated below). Highly flexible models can be, and sometimes are, the appropriate choice to model a relationship, we just need a large sample to justify it. Outrageous claims need outrageous evidence."
+ "objectID": "posts/index.html",
+ "href": "posts/index.html",
+ "title": "Posts",
+ "section": "",
+ "text": "Hackathon 2024\n\n\n\n\n\n\n\n\n\n\n\n28 May 2024\n\n\nNumbats Rise-up\n\n\n\n\n\n\n\n\n\n\n\n\nSecret Santa 2023\n\n\n\n\n\n\n\n\n\n\n\n22 November 2023\n\n\nNumbats Gathering\n\n\n\n\n\n\n\n\n\n\n\n\nReducing duplication in teaching materials\n\n\n\n\n\n\n\n\n\n\n\n19 July 2023\n\n\n\n\n\n\n\n\n\n\n\n\nHackathon 2023\n\n\n\n\n\n\n\n\n\n\n\n24 February 2023\n\n\nNumbats Rise-up\n\n\n\n\n\n\n\n\n\n\n\n\nDiving into dependen-“sea”\n\n\nHow CRAN packages are interconnected\n\n\n\n\n\n\n\n\n18 October 2022\n\n\nH. Sherry Zhang\n\n\n\n\n\n\n\n\n\n\n\n\nHow long do maps on ggplot facets take?\n\n\n\n\n\n\n\n\n\n\n\n27 May 2022\n\n\nH. Sherry Zhang\n\n\n\n\n\n\n\n\n\n\n\n\nHexmaps with sugarbag make it easier to see the electoral map\n\n\n\n\n\n\n\n\n\n\n\n21 May 2022\n\n\nDi Cook\n\n\n\n\n\n\n\n\n\n\n\n\nDo you need some analytics help? Maybe our internship program is for you!\n\n\n\n\n\n\n\n\n\n\n\n18 February 2022\n\n\nDan Simpson\n\n\n\n\n\n\n\n\n\n\n\n\nTrying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation\n\n\n\n\n\n\n\n\n\n\n\n18 December 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nA Future Public Disturbance Explains LDA\n\n\n\n\n\n\n\n\n\n\n\n2 August 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nUsing PCA to Bully My Housemates (Specifically Tom)\n\n\n\n\n\n\n\n\n\n\n\n19 April 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nCan our Masters students help you?\n\n\n\n\n\n\n\n\n\n\n\n5 March 2021\n\n\nRob J Hyndman\n\n\n\n\n\n\n\n\n\n\n\n\nLearning Boosting Through Me Getting Fired from Tutoring\n\n\n\n\n\n\n\n\n\n\n\n3 January 2021\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nBaby Teeth are Temporary, Model Interpretability is Forever\n\n\n\n\n\n\n\n\n\n\n\n13 October 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nHow a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)\n\n\n\n\n\n\n\n\n\n\n\n6 October 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nCriminal Statistics in Baby Murder Court Cases\n\n\n\n\n\n\n\n\n\n\n\n14 September 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\n4 Things We Can Learn About Conspiracy Theories and Model Flexibility\n\n\n\n\n\n\n\n\n\n\n\n11 August 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nMario Party: Destroyer of Friendships and Explainer of Convolutional Neural Networks\n\n\n\n\n\n\n\n\n\n\n\n11 August 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nMy Idiot Brain Ruined My School’s NAPLAN average, But Bootstrapping Could Have Saved it\n\n\n\n\n\n\n\n\n\n\n\n4 August 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nUsing the Bachelor to Understand Permutation Variable Importance\n\n\n\n\n\n\n\n\n\n\n\n29 July 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nA Deep Dive into How Flexibility Affects The Bias and Variance Trade Off\n\n\n\n\n\n\n\n\n\n\n\n20 July 2020\n\n\nHarriet Mason\n\n\n\n\n\n\n\n\n\n\n\n\nNUMBAT Hackathon 2020\n\n\nCoding together\n\n\n\n\n\n\n\n\n19 February 2020\n\n\nDi Cook\n\n\n\n\n\n\nNo matching items"
},
{
- "objectID": "posts/FlexiblevsInflexible/index.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
- "href": "posts/FlexiblevsInflexible/index.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups",
- "text": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups\nThe introduction of the internet was the age of new information. Conspiracy theories were on their way out, now anyone can use their phone and find the facts in seconds. Or can they? What I unfortunately discovered when mum got involved with conspiracy theories, is that for every website with legitimate information, there are 50 that don’t. The sheer vastness of the internet means that whenever we expand our search for hidden truth, we are just as likely to discover falsities. This is a useful illustration in dimensionality.\n\nFlexible Models Are Hurt More By Additional Parameters\nDimensionality interacts with the flexible vs inflexible models in two ways. The first is that in some occasions adding dimensions can literally be seen as making the model more flexible. Think of adding a squared variable to a linear regression to make it quadratic, we have made the model more flexible by adding a dimension. The second way it interacts with our models, is by increasing the distance between observations, and thus the amount of input space each observations needs to be used for. To get technical, each additional parameter makes the area each observation is responsible for increase exponentially. Just like how increasing flexibility increases the “weight” of observations by localising their impact on the model, dimensionality makes the total “area” bigger, and so it does a similar thing. Sometimes the relationship between our variables needs to be modeled with a highly flexible model, and so we need to keep this interaction between flexibility and dimensionality in mind so the variance doesn’t get out of control."
+ "objectID": "posts/LIME/index.html",
+ "href": "posts/LIME/index.html",
+ "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "section": "",
+ "text": "When I found out baby teeth fall out, I realised the futility brushing them. The teeth are temporary, but those extra 5 minutes of playing Pokemon are forever. So I quit brushing my teeth. This wouldn’t have been too big a problem for a normal kid, but I also refused to drink water. A strangely pervasive problem in our family that started young (my brother was weaned off breast milk using chocolate Breaka) and lived into adulthood. I exclusively drank Golden Circle Raspberry Cordial, called it pink juice, carried it in my drink bottle, and I would sooner collapse from dehydration before I drank anything else. As you can imagine my teeth decayed at an alarming rate. A visit to the dentist in second grade told my parents something they were well aware of. If you let a child make their own terrible health decisions, they will cost you $10k in dental bills because apparently to a child, pain is an illusion. A lesson that should have been no surprise to them since that same year I made Mum let me slip my broken arm out of its cast to do my ballet examination, and I was still annoyed I only got a Merit. I don’t know if all kids are immune to pain and the consequences of their actions, but I certainly was. So for years I had 4 metal crowns, 13 fillings, and a sudden jolt of pain every time I accidentally got aluminium in my mouth. As an adult I leant my lesson and brush my teeth and floss twice a day. I mean I still don’t drink water, I just upgraded from Pink Juice to Pepsi Max. But I still consider a 50% improvement an inspiring story of growth.\nWhat is the point of this story? Is it related to today’s topic or has this blog become a digital diary where I decompress years of a being a child psychopath with irresponsible parents? Both. Although if my parents has a say in this blog they would probably argue they weren’t irresponsible, but rather thought the best way for us to learn was to experience the consequences of our decisions. The problem in my decision making as a child was I had too much of a focus on the long term. While it was true that the teeth were not permanent and would fall out, I still cringe at the idea of biting into metal packaging. Most methods of understanding machine learning models focus on the model as a whole, but in this post we are going to look at the local interpretation. LIME (Localised Interpretable Models) is a model interpretation method that can be applied to any machine learning algorithm, even if its a “black box” method by breaking it into smaller local models that are easy to interpret. To understand the value in this, we need to first look at the flexibility and interpretability trade off."
},
{
- "objectID": "posts/FlexiblevsInflexible/index.html#capitalism---the-gateway-conspiracy-to-lizard-people",
- "href": "posts/FlexiblevsInflexible/index.html#capitalism---the-gateway-conspiracy-to-lizard-people",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "3: Capitalism - The Gateway Conspiracy to Lizard People",
- "text": "3: Capitalism - The Gateway Conspiracy to Lizard People\nNobody suddenly wakes up in the morning, looks in the mirror and says to themselves “Yes, today is the day. Today is the day I start believing in the lizard overlords.” I believe the process is more nuanced than that. Just like the “SayNoToPeerPressure” acting troupe who’s dreams I got to watch die in the comfort of my high school gym, I’m about to push the idea of gateways. From my personal experience, the process of becoming involved in conspiracies looks a little something like this:\n \nMy point is that ideas that hinge on something already well established in society are easier to swallow than those that aren’t. That is not to say entirely new theories must be wrong, but rather that they are harder for people to immediately understand and they are also more likely to be too out there for the general population to get on board with. I think of parametric and non-parametric models in a very similar way to how people think of capitalism vs lizard people conspiracy theories.\n\nNon-Parametric Models Are Usually More Flexible, But Not Always\nParametric models construct our function by assuming its type, and then estimating the best model within this range. Non-parametric models do not make any assumptions about our model’s form, but rather try to fit to the general shape of the data. Parametric and Non-parametric does not directly translate to flexibility; they both have the potential to produce a very flexible or inflexible fit. For example, a constant polynomial and a K-NN model where K=N would both predict the average response (the most inflexible model we can get). Rather, just like dimensionality, non-parametric models can fall into the same pitfalls as flexibility, and so the limits of our dataset should be kept in mind. By their nature, non-parametric models are more susceptible to variance from changes in the sample, as the sample is the only thing the model is using to make its predictions. Therefore, they are more likely to overfitting than parametric models and are usually more difficult to interpret. These features mean that in general non-parametric models are more flexible, simply by their nature, however they are still have the potential to be inflexible."
+ "objectID": "posts/LIME/index.html#focus-too-much-on-the-big-picture-get-10k-in-dental-bills",
+ "href": "posts/LIME/index.html#focus-too-much-on-the-big-picture-get-10k-in-dental-bills",
+ "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "section": "",
+ "text": "When I found out baby teeth fall out, I realised the futility brushing them. The teeth are temporary, but those extra 5 minutes of playing Pokemon are forever. So I quit brushing my teeth. This wouldn’t have been too big a problem for a normal kid, but I also refused to drink water. A strangely pervasive problem in our family that started young (my brother was weaned off breast milk using chocolate Breaka) and lived into adulthood. I exclusively drank Golden Circle Raspberry Cordial, called it pink juice, carried it in my drink bottle, and I would sooner collapse from dehydration before I drank anything else. As you can imagine my teeth decayed at an alarming rate. A visit to the dentist in second grade told my parents something they were well aware of. If you let a child make their own terrible health decisions, they will cost you $10k in dental bills because apparently to a child, pain is an illusion. A lesson that should have been no surprise to them since that same year I made Mum let me slip my broken arm out of its cast to do my ballet examination, and I was still annoyed I only got a Merit. I don’t know if all kids are immune to pain and the consequences of their actions, but I certainly was. So for years I had 4 metal crowns, 13 fillings, and a sudden jolt of pain every time I accidentally got aluminium in my mouth. As an adult I leant my lesson and brush my teeth and floss twice a day. I mean I still don’t drink water, I just upgraded from Pink Juice to Pepsi Max. But I still consider a 50% improvement an inspiring story of growth.\nWhat is the point of this story? Is it related to today’s topic or has this blog become a digital diary where I decompress years of a being a child psychopath with irresponsible parents? Both. Although if my parents has a say in this blog they would probably argue they weren’t irresponsible, but rather thought the best way for us to learn was to experience the consequences of our decisions. The problem in my decision making as a child was I had too much of a focus on the long term. While it was true that the teeth were not permanent and would fall out, I still cringe at the idea of biting into metal packaging. Most methods of understanding machine learning models focus on the model as a whole, but in this post we are going to look at the local interpretation. LIME (Localised Interpretable Models) is a model interpretation method that can be applied to any machine learning algorithm, even if its a “black box” method by breaking it into smaller local models that are easy to interpret. To understand the value in this, we need to first look at the flexibility and interpretability trade off."
},
{
- "objectID": "posts/FlexiblevsInflexible/index.html#there-are-always-going-to-be-loonies-on-the-internet",
- "href": "posts/FlexiblevsInflexible/index.html#there-are-always-going-to-be-loonies-on-the-internet",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "4: There are Always Going to Be Loonies on the Internet",
- "text": "4: There are Always Going to Be Loonies on the Internet\nWe can all spend our entire lives trying to convince everyone on the internet that they are wrong, but at the end of the day, we live in a complicated world, with complicated people, and there are always going to be loonies on the internet. Rather than dreaming of a world where everyone knows everything all the time, the system should just be to manage the chaos. The important life skill to learn isn’t that everyone needs to be corrected, and to focus on the nutters, but rather enjoy the fact that the majority get most things right, most of the time. Socrates might disagree with my idea on majority votes but you win some, you lose some.\n\nYou Will Always Have Irreducible Error and It’s Size Matters\nObviously we can never have a perfect prediction since we are working with random variables. We can make our models more flexible to try and account for as much of the error as we can, but if we do, we might end up missing the underlying system entirely. No matter how flexible our model is, we will never have perfection thanks to our irreducible error (an attempt at making one is illustrated below). The interaction between flexibility and irreducible error comes from its size. A large irreducible error means the general shape change more drastically between samples, while a small one means our samples will remain consistent. Just like dimensionality, assumptions about our model, and sample size, this is just something that needs to be kept in mind as it has a strong interaction with the flexibility of our model, and the error from variance."
+ "objectID": "posts/LIME/index.html#the-flexibility-and-interpretability-trade-off",
+ "href": "posts/LIME/index.html#the-flexibility-and-interpretability-trade-off",
+ "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "section": "The Flexibility and Interpretability Trade Off",
+ "text": "The Flexibility and Interpretability Trade Off\nI have mentioned (at length) the bias and variance trade off that comes into play when considering the flexibility of a model. What I have not mentioned, is the interpretation trade off that happens at the same time. When we “localise” our model by increasing its flexibility, allowing it to better respond to changes in variables, we also “localise” the possible interpretation. This in turn, means that a single interpretation for the entire span of the possible inputs is no longer useful. At the extreme end of this trade off, we have models in which the intermediate steps are almost impossible to understand, called “black box” models. Early statistics courses introduce flexibility with quadratic models, and deal with the trade off by splitting the area of interpretation. Sadly this is not an idea that easily lends itself easily to more complicated models, a problem I have illustrated below.\n \nAs we start to get into more complicated models our interpretation methods slightly abandon this idea of localising our understanding and instead opt for completely new techniques, like permutation variable importance which I discussed in a previous post. Instead of inventing a new way to understand our models LIME tries to make the interpretation more “localised” in the same way that flexibility “localised” the model itself."
},
{
- "objectID": "posts/FlexiblevsInflexible/index.html#to-conclude",
- "href": "posts/FlexiblevsInflexible/index.html#to-conclude",
- "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
- "section": "To Conclude",
- "text": "To Conclude\nDon’t let your mum hang out with weirdos, and treat conspiracy theories and overly complicated models with scepticism."
+ "objectID": "posts/LIME/index.html#how-does-it-work",
+ "href": "posts/LIME/index.html#how-does-it-work",
+ "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "section": "How does it work?",
+ "text": "How does it work?\nThe main idea of LIME is the same main idea of calculus, which is if we zoom in a bunch we can approximate crazy non-linear functions with straight lines. These approximations are pretty good around that point, but get worse the further we move away. The way it works is actually quite simple and can be broken down into a handful a simple steps. 1. Make a localised dataset based on a single observation 2. Build a model on this localised dataset 3. Interpret that model. Some of the technicalities of the process change depending on the type of data we have (tabular, images or text) and I will go through each of them, but in essence, the idea is the same. I’m going to walk through trying to predict cavities based on the three types of data to illustrate the process of LIME, but keep in mind, I’m assuming we already have some trained model that is making prediction, and a set of test observations. Is crushing pain, blackening teeth, or an exclusive diet of raspberry cordial a predictor of cavities? Lets find out.\n\nTabular Data\nThe first method we are going to look at is tabular data. Lets say instead of doing normal dentistry work my dentist wants to predict if I have cavities based on how often I say I brush my teeth, and how much sugar I eat a day. This is a hypothetical world and my hypothetical doctor is omnipotent apparently. He wants to classify his patients into 3 levels based on the financial commitment they are probably about to make to his family practice. He puts my teeth brushing and sugar intake into the model, and it looks like his family should start packing their swimmers, because they are about to go to Hawaii. But how did the model come up with the prediction? In enters, LIME.\nIllustrated below (and explained in this paragraph) is the process LIME will go through to understand this classification. First we select a single point to run our model on, in this case, me or an observation very close to me. Then LIME will generate some “pretend” data around it according to independent Gaussian distributions. As a side note, this means it ignores correlation, and can generate some points that are unlikely to occur in the real data. Then LIME will run our real data point point and all its fake friends through the black box model and find their hypothetical predictions. Similar to LOESS models, the observations are then reweighted based upon their distance to to the initial(only real) data point. Remember, we aren’t trying to understand the model overall, we are only interested in the area surrounding our point. Now, on our fake, weighted data, we train an easy to interpret model. Something like a tree model or linear regression. It doesn’t have to be even slightly similar to the black box model we are analysing, all that matters is that it is a model that is simple, easy to understand and easy to explain.\n \n\n\nImages\nSo my dentist is rubbing his hands together when my mum brings me in for a check-up. Once again ignoring normal dental procedures (I’m starting to wonder if this man is a dentist or some back alley scam artist my parents dug up to teach me a lesson) the dentist decides to take a normal photo of my teeth and predict the probability I have a cavity. His picture based model also suggests cavities, but once again, how did it make that decision? LIME is back to his rescue.\nOnce again we select some observation from our dataset, in this case, a photo of my sad decaying teeth. 
Next, following the tabular method, we would want to create a fake dataset of similar observations, but this is where we run into our first problem. Distance is easy to see in tabular data; it’s our normal run-of-the-mill Euclidean distance. But how do we define distance for pictures? What metric can we use to say how similar two pictures are? This isn’t a question LIME answers, or even tries to answer, but the little algorithm that could does its best to work through it. On pictures, rather than making our fake observations a sample that is “close” to the observation in distance, it varies the “superpixels” of the image. Superpixels are just groups of pixels that are next to each other and look similar, so they are grouped together. For example, if you had a picture of my face, my skin, hair, lips, etc. would each be their own superpixel. To make our new dataset, LIME turns random superpixels off to create our local dataset, i.e. the pixels in that group cease to exist, are changed to 0, become a black hole of lost information in the land of data. Now we have a bunch of pictures that we run through the black box model to get some cavity predictions. Once again a simple model (like a linear regression) is built using the superpixels as inputs and the probability of a cavity as an output. The image is then coloured by LIME based on whether each superpixel has a positive or negative impact on the classification.\n \n\n\nText\nFinally, after my dentist(?) finishes his dental(?) work, he decides to predict the chance of an expensive revisit based on my conversation with my Mum on our way out. This is a simple classification problem again and the model predicts I will be back with an expensive cavity. Finally, the dentist(??) implements LIME one more time.\nThe method for text is almost identical to the one for images, only instead of superpixels, it turns words off and on."
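To make the tabular recipe concrete, here is a from-scratch R sketch of a LIME-style local surrogate. It is not the lime package; `black_box`, `x0`, the helper name, and the toy column names in the usage comment are all hypothetical, and real implementations handle the details (sampling, kernels, feature selection) more carefully.

```r
# black_box: any fitted model whose predict() returns a numeric score (e.g. a probability)
# x0:        a named numeric vector, the single real observation we want to explain
# data:      the training features, used only to set the scale of the perturbations
explain_locally <- function(black_box, x0, data, n = 5000, kernel_width = 1) {
  # 1. Make a localised dataset: independent Gaussian noise around the observation
  sds  <- apply(data, 2, sd)
  fake <- as.data.frame(mapply(function(m, s) rnorm(n, m, s), x0, sds))
  names(fake) <- names(data)

  # 2. Run the fake points through the black box model
  fake$pred <- predict(black_box, newdata = fake)

  # 3. Reweight the fake points by their (scaled) distance to the real observation
  d <- sqrt(rowSums(scale(fake[names(data)], center = x0, scale = sds)^2))
  w <- exp(-(d / kernel_width)^2)

  # 4. Fit a simple, interpretable model on the weighted fake data
  lm(pred ~ ., data = fake, weights = w)
}

# e.g. explain_locally(cavity_model,
#                      x0   = unlist(patients[7, c("brushing", "sugar")]),
#                      data = patients[, c("brushing", "sugar")])
```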
},
{
- "objectID": "courses.html#master-of-business-analytics",
- "href": "courses.html#master-of-business-analytics",
- "title": "Courses",
- "section": "Master of Business Analytics",
- "text": "Master of Business Analytics\nArm yourself with computational statistical tools in the face of uncertainty.\nSome of the units in the MBAt are:\n\nETC5510 Introduction to data analysis\nETC5512 Wild-caught data\nETC5521 Diving deeply into data exploration\nETC5523 Communicating with data\nETC5500 Applied forecasting\nETC5250 Introduction to machine learning\nETC5450 Advanced R programming\n\nA full list of topics for the program can be seen here"
+ "objectID": "posts/LIME/index.html#limitations-of-the-method",
+ "href": "posts/LIME/index.html#limitations-of-the-method",
+ "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "section": "Limitations of The Method",
+ "text": "Limitations of The Method\nThe obvious problem with LIME is the same thing that made it a good idea, and the same reason some people think the earth is flat. If we zoom in too much, we lose sight of the big picture. Since our understanding is limited to single real observation from our dataset, and running it on every observation would be computationally painful, it is at our discretion which observations, and how many observations we run LIME on to understand what is under the hood of a black box model. While I only went through a general understanding of how the process works, there are other posts out there that discuss practical implementation of the model and some of the more technical aspects of how it works which are certainly worth a read."
},
{
- "objectID": "contact.html",
- "href": "contact.html",
- "title": "Contact",
- "section": "",
- "text": "@numbats_rise_up\n Enter Education Building, we are located primarily on level 3, and to the east\n Monash University, Clayton Campus, Wellington Rd, Melbourne, 3800\n\n\nView Larger Map"
+ "objectID": "posts/LIME/index.html#in-conclusion",
+ "href": "posts/LIME/index.html#in-conclusion",
+ "title": "Baby Teeth are Temporary, Model Interpretability is Forever",
+ "section": "In Conclusion",
+ "text": "In Conclusion\nIf something is called “pink juice” it will give you cavities, and if your dentist uses machine learning algorithms instead of normal dental practices, he might not be an actual dentist.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.— title: “Untitled” editor: visual —"
},
{
- "objectID": "posts/pca/index.html",
- "href": "posts/pca/index.html",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "objectID": "posts/boosting/index.html",
+ "href": "posts/boosting/index.html",
+ "title": "Learning Boosting Through Me Getting Fired from Tutoring",
"section": "",
- "text": "I recently moved into a share house with three of my friends, and while we generally get along pretty well, I would be lying if I said I never fantasised about burning the place down with them all in it. Today, after I woke up to the dishwasher run with less that half a load, I made this passive aggressive drawing and sent it to the group chat. I have great conflict resolution skills.\n\nThe three people I live with all know me, but none of them know each other, and so as the central housemate, I have often wondered if this clear social dynamic appears in our communication (such as messenger data). This is something that could be easily found through a combination of a principal component analysis (PCA) and violating my housemates privacy. Both of which are enjoyable and straightforward. When PCA was introduced to me in uni, I struggled a bit to understand the plots. So, while I’m violating my housemates privacy, I’m also going to go over the ‘gist’ of PCA and unravel the plots that come with it."
+ "text": "I’ve had about… 13 jobs at this point in my life. Among them were jobs like tutoring, nannying, swim teaching, ect. so I have developed had a decent level of experience in teaching kids, specifically teaching them maths. While swim teaching doesn’t seem like it employs a lot of maths, I would play a “who can get closest to the number I’m thinking” game to decide who goes first. I then used it to explain game theory and how to optimise their strategy based on what the other kids would pick if they were acting as rational agents. They didn’t fully understand, but it was a fun exercise.\n\n\n\nI was never a very good tutor because I have a tendency to overcomplicate simple problems, and argue with the student’s teacher or parent. A recurring personality trait that is likely apparent after reading enough posts from this blog. The worst case of this was when I was fired from my tutoring job several years back. But what do my failures as a tutor have to do with boosting?.\n\n\nI have always seen boosting as the one of the most intuitive ensemble methods. For anyone who doesn’t know, an ensemble model combines many individual models to create one aggregate model that tends to have greater accuracy and less variance than any of the individual models. Think of it as the machine learning version of everyone voting to make a decision instead of a single expert making a decision. If we relate them to human studying, boosting is like doing every question at least once and then only revising the questions we previously got wrong. This makes boosting more similar to the way I study, and try to teach my (previous) tutoring students. In machine learning, boosting builds the models sequentially where each new model is built on the residuals of our current model, and only takes a small amount of predictive power from each (known as the learning rate). To see how this in action, lets look at the animation below.\n\n\n\nHere, I have made a boosting model using 50 regression trees that each consist of a single split, and have a learning rate (how much information we take from each tree) of 0.1. The colour represents the value for y. In the background we have the current predicted values for that area, and the actual data we are working with in the foreground. The size of the data represent the current error for that observation. It is pretty apparent that the data points become smaller as the background (predicted value) more closely resembles our training data. Each dashed line indicates the most recent regression tree (or in this case stump) that has been added to the model. Since this is a model that progressively learns, both the error and prediction change as we incorporate more and more models. Now that we have a visual on how boosting works, lets talk about tutoring."
},
{
- "objectID": "posts/pca/index.html#surrounded-by-incompetence",
- "href": "posts/pca/index.html#surrounded-by-incompetence",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "objectID": "posts/boosting/index.html#my-employment-history",
+ "href": "posts/boosting/index.html#my-employment-history",
+ "title": "Learning Boosting Through Me Getting Fired from Tutoring",
"section": "",
- "text": "I recently moved into a share house with three of my friends, and while we generally get along pretty well, I would be lying if I said I never fantasised about burning the place down with them all in it. Today, after I woke up to the dishwasher run with less that half a load, I made this passive aggressive drawing and sent it to the group chat. I have great conflict resolution skills.\n\nThe three people I live with all know me, but none of them know each other, and so as the central housemate, I have often wondered if this clear social dynamic appears in our communication (such as messenger data). This is something that could be easily found through a combination of a principal component analysis (PCA) and violating my housemates privacy. Both of which are enjoyable and straightforward. When PCA was introduced to me in uni, I struggled a bit to understand the plots. So, while I’m violating my housemates privacy, I’m also going to go over the ‘gist’ of PCA and unravel the plots that come with it."
- },
- {
- "objectID": "posts/pca/index.html#what-is-pca",
- "href": "posts/pca/index.html#what-is-pca",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
- "section": "What Is PCA?",
- "text": "What Is PCA?\n\nThe Theory\nI would have just jumped into a nice example of understanding the plots, but for the sake of completeness I will explain how PCA works. The idea of PCA is to summarise the “information” of a dataset into its principal components (PCs), and then interpret those instead. These PCs are built to be linear combinations of our variables in their most “interesting” direction. Where “interesting” means the direction of most variance. Think of a linear regression but instead of projecting our results onto a line that uses x to capture as much information as possible about y, we are using both variables trying to capture as much information as possible in the x and y direction that has the most variance. Explaining this with words is a bit difficult, so I have drawn a visualisation of this below.\n\nFollowing on from this illustration, an easy way to understand principal components is to shift your current understanding of linear regression (I’m assuming you have some current understanding of linear regression). The variable loadings are similar to variable weights in the regression line. We interpret the loadings as “how much that variable contributes to the PC”. Our prediction for a value in linear regression is its projection onto the regression line (with the error shown in the above illustration in red). When working with PCA, our observation’s values are their projection onto the PC line. It is important to note that the red lines in in the PCA drawing is not error, but rather the “remaining” value that will then be used to build the second PC. This is just a quick overview of what these values represent (if you want something more technical look at a textbook or something, this isn’t a maths class). Now, lets take a quick look at the data we are working with.\n\n\nSharehouse Chat Data\nTo put some faces (albeit badly drawn cartoon ones) to names, here is an illustration of my housemates. I have also added a fun fact (checked by them after a large amount of “is this what you think of me” arguing) to help give an idea of their personalities. I’m basically introducing them like a 2000’s MTV dating show, but hopefully this will age better and be less racist/homophobic/sexist.\n\nThe data we are going to be working with is the Facebook messenger records of the sharehouse group chat. When I downloaded it, there were about about 6000 messages, over 3000 of which were sent by me. I was originally interested in analysing all my messenger data but seeing that number stung enough for me to pretend I didn’t download my other chat files. I’d rather live in ignorance than face the fact that I feel the need to update all my friends on everything I do.\nSo, after a bit of cleaning (removing punctuation, removing stop words, breaking observations up into single words, counting the frequency by person, diving each value by total number of words said in the chat by that person) I have a my dataframe. Each variable is someone who lives in the house, each observation is a word, and the values are how many times that word was said relative to the number of words that person has sent in total. So my value for the word “tom” is how many times I have said “tom” as a fraction of all the words I have sent to the chat. I could skip making the values a “relative” frequency, but then our PCA would likely just tell us that I am absolutely incapable of shutting up, rather than what words typify each speaker. 
Below is a glimpse at the data that we will run through the PCA.\n\n\n\n\n\nword\nHarriet\nZac\nEm\nTom\n\n\n\n\ntom\n0.0178\n0.0171\n0.0107\n0.0081\n\n\nhouse\n0.0165\n0.0155\n0.0135\n0.0129\n\n\nzac\n0.0149\n0.0078\n0.0088\n0.0064\n\n\nem\n0.0127\n0.0194\n0.0000\n0.0081\n\n\nyeah\n0.0090\n0.0248\n0.0334\n0.0274\n\n\ntime\n0.0091\n0.0148\n0.0102\n0.0113\n\n\n2\n0.0080\n0.0054\n0.0088\n0.0064\n\n\nshit\n0.0080\n0.0062\n0.0060\n0.0000\n\n\nstuff\n0.0074\n0.0023\n0.0037\n0.0064\n\n\npeople\n0.0067\n0.0078\n0.0042\n0.0081\n\n\n\n\n\nNow that we have some data, lets discuss how we interpret the loadings."
+ "text": "I’ve had about… 13 jobs at this point in my life. Among them were jobs like tutoring, nannying, swim teaching, ect. so I have developed had a decent level of experience in teaching kids, specifically teaching them maths. While swim teaching doesn’t seem like it employs a lot of maths, I would play a “who can get closest to the number I’m thinking” game to decide who goes first. I then used it to explain game theory and how to optimise their strategy based on what the other kids would pick if they were acting as rational agents. They didn’t fully understand, but it was a fun exercise.\n\n\n\nI was never a very good tutor because I have a tendency to overcomplicate simple problems, and argue with the student’s teacher or parent. A recurring personality trait that is likely apparent after reading enough posts from this blog. The worst case of this was when I was fired from my tutoring job several years back. But what do my failures as a tutor have to do with boosting?.\n\n\nI have always seen boosting as the one of the most intuitive ensemble methods. For anyone who doesn’t know, an ensemble model combines many individual models to create one aggregate model that tends to have greater accuracy and less variance than any of the individual models. Think of it as the machine learning version of everyone voting to make a decision instead of a single expert making a decision. If we relate them to human studying, boosting is like doing every question at least once and then only revising the questions we previously got wrong. This makes boosting more similar to the way I study, and try to teach my (previous) tutoring students. In machine learning, boosting builds the models sequentially where each new model is built on the residuals of our current model, and only takes a small amount of predictive power from each (known as the learning rate). To see how this in action, lets look at the animation below.\n\n\n\nHere, I have made a boosting model using 50 regression trees that each consist of a single split, and have a learning rate (how much information we take from each tree) of 0.1. The colour represents the value for y. In the background we have the current predicted values for that area, and the actual data we are working with in the foreground. The size of the data represent the current error for that observation. It is pretty apparent that the data points become smaller as the background (predicted value) more closely resembles our training data. Each dashed line indicates the most recent regression tree (or in this case stump) that has been added to the model. Since this is a model that progressively learns, both the error and prediction change as we incorporate more and more models. Now that we have a visual on how boosting works, lets talk about tutoring."
},
{
- "objectID": "posts/pca/index.html#the-loadings",
- "href": "posts/pca/index.html#the-loadings",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
- "section": "The Loadings",
- "text": "The Loadings\n\nThe Theory\nThe loadings have two things about their interpretation that make them a bit tricky to understand: 1. We are plotting what would be on the axis of the plot in our typical scatter plot (the variables) as observations 2. We are using these “observations” to understand the axis (our PCs). I have drawn this relationship below for additional clarity.\n\nNote: these aren’t the actual loading values of PC1 and PC2 from the example below, this is just an illustration\nTo make matters even more complicated, we usually plot our PCA on a biplot with both loadings and observations. We will make and interpret this plot at the end, but since this is an “introduction to understanding PCA plots” we are going to start with only plotting the loadings, and work our way to the biplot.\nTo interpret our loadings we need to keep three things in mind: 1. The principal components summarise information in descending order of importance. This means that each PC will represent a more overt trend in the data than the PC that follow it. 2. The direction of the PCs is the most important take away. If all your loadings are in the same direction then this PC is analysing the ways in which all your variables are the same. If they move in opposite directions, the PC is identifying a juxtaposition. The actual direction of the loading (positive or negative) doesn’t matter too much outside of the loading’s direction relative to the others. This might seem a bit confusing, it will make more sense once we look at the first loading in the example below. 3. The magnitude of the loading is the least important part. If you start getting so detailed that you are thinking deeply about the magnitude, you are likely overcomplicating the problem for yourself. Just pay attention to the loadings that are significantly different from 0 (I marked these using a red line in the example).You can find your significance line as \\frac1{\\sqrt{p}} where p is the number of variables in your PCA (in the example it’s 4). As with anything, this will be easier to understand with an example, so lets just look at what the sharehouse PCA produced.\n\n\nSharehouse Chat Loadings\nTo start off with, we need to use the loadings to interpret the PCs. The first two PC’s capture most of the variance, and so typically we focus on those two, however since we only have 4 variables (and so 4 possible PCs) I might as well do them all.\n\nKeeping in mind what we covered above, we can analyse these plots. As a side note, the order of names (the x-axis of these plots) are arbitrary and organised only to make the words readable, so we only need to interpret the y-axis (the PC loadings). To begin lets start with PC1, the most important PC. Since all the loadings are negative, any persons use of a word will give that word a negative value on the first PC. To put it simply, words we say a lot as a combined group will have a large negative score, and words that we never say will sit around 0. There wont be any positive values on PC1 because each word’s value is the Housemate'sPCLoading\\times{Housemate'sWordFrequency}, summed up for all 4 of us. So since none of the words will have a negative frequency that could cancel out the negative loadings word’s wont have positive value on PC1. Here are the 4 loading interpreted in their positive direction:\nPC1: Words None of us say - The overarching ways in which the four of us are similar thanks to generation and circumstances (of living together). 
This PC will likely contain words people who live together and people our age use. PC2: Words Tom never says - Out of all of us, the most distinct speaker of the group is Tom. PC3: Words that Em uses - Em is the next most distinct. PC4: Words Differentiate Zac and I - Zac and I were on the same side of all the other loadings, and so once all the other sources of variance have been dealt with, this is all that is left. It makes sense, as we are the oldest and closest friends, so our speech is the most similar.\nInterestingly, the loadings captured the underlying dynamics of the group pretty well. Since the PCs are organised such that they explain decreasing variance, this tells us that the overarching patterns of speech between the 4 of us (PC1) is more salient than the difference between Tom’s and the rest of us (PC2) and so on. I have drawn the social circles identified by the PC loadings below, both as an illustration of the analysis, and to personally attack Tom. Using this understanding of our new variables (the PCs) we can interpret our observations, just as we would normal variables.\n\nAnother note I want to make is that I could have set up this data frame so that the words were the variables instead of the sender (I actually did this the first time without thinking). The main problem with this comes in the analysis. If the variables are words and the largest loadings come from “yeah”, “tom” and “house”, it is hard to understand how these words are similar, and how they are different. That analysis is much easier to do on people, because I have prior understanding of the context of those variables."
+ "objectID": "posts/boosting/index.html#part-1-focusing-on-mistakes",
+ "href": "posts/boosting/index.html#part-1-focusing-on-mistakes",
+ "title": "Learning Boosting Through Me Getting Fired from Tutoring",
+ "section": "Part 1: Focusing on Mistakes",
+ "text": "Part 1: Focusing on Mistakes\n\nIf you get 100%, You don’t need tutoring.\nThe interaction that got me fired from my tutoring company was with a kid I’ll call, Riley. After being begged to take him on as a student (he was a 5th grader and I teach high school maths) they sent me the information I needed to teach Riley. The first was an email from his teacher that read like this: Hi Mrs Riley, I’m not sure why you are getting your son tutoring considering he has no trouble in class. I have nothing he needs to work on. Maybe the tutor could teach him next semester’s content, but then he would just be bored in class so I wouldn’t recommend it.” I think, great, not only does this kid not need tutoring, but his parents are actively going against his teachers advice. Not a good sign. Next I read a note from the last tutor. “I just bring a computer game or a worksheet for him to do, and then mark it” Double great. This comment was even worse. I was clear this kid had nothing to learn, so it didn’t matter what the last tutor did with him. A tutoring session of watching a kid do things they already knows how to do with no useful feedback can go completely unnoticed. You get the most “bang for your buck” focusing on your worst areas, as they are both the areas requiring the most improvement, and are forgotten the fastest. I incorporate this attitude to every aspect of my life. You can see how in the visual below.\n\n\n\nIf you are just revising things you already know with 100% accuracy, you are not learning.\n\n\nBuilding Models in the Residual Space\nIf we build an ensemble model that is 50 models, each identical and with perfect predictions, we get the same result as if we made one. This is just wasting computational power much in the same way Riley’s family was wasting money on tutoring. In boosting, since each model is built on the residuals of previous models, it is trying to make sure that it does not repeatedly learn things it already knows. The model focuses on the most common, frequent, and damning errors, and works its way back from that. In the first animation, I let the size represent the errors, but each model is not built using the response variable, it is built using the residuals. Here, using the exact same data and model above, I have instead animated each individual tree as it tries to predict the residuals.\n\n\n\nWe can see that when we start our boosted model, the residuals are essentially our y value (since the initial prediction for the whole area is 0), and as the main model becomes more accurate, the residuals become 0, and new trees don’t have any information to contribute to the model. If the model continued much further, it would just randomly build trees on the irreducible error.\nBy focusing on the residual space, the model ensures that we aren’t wasting computations by relearning something we already know. In a similar way, the best way to learn as a human is not to revise the areas we get 100% in, but rather the areas we are failing in as they offer the most room for improvement."
},
{
- "objectID": "posts/pca/index.html#understanding-observations",
- "href": "posts/pca/index.html#understanding-observations",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
- "section": "Understanding Observations",
- "text": "Understanding Observations\n\nThe Theory\nUnderstanding the observations is very straight forward once you have the PC interpretations. Usually when analysing our data, the process looks something like this:\n Variable Meaning -> Understand Observations \nFor example, a low time in a 100m sprint can be interpreted as fast. Obviously, PC1 does not have an inherent meaning to us in the same way that the time for a 100m sprint does, but that is what the loading interpretations was for. The process for understanding the data plots in PCA is:\n Construct PCs -> Use loadings to find PC meaning -> Understand Observations \nSo from this we can see that the interpretation of data in PCA vs regular analysis is almost the same, there is just an extra step (which we have already done in our example) that can complicate it a bit. Now that we understand how to interpret the observations in the PCA, let’s apply this to the sharehouse chat data to finish off the analysis.\n\n\nSharehouse Chat Observations\n\nHow do we interpret these plots? Well we need to use our interpretations of the loadings to understand what our axis represent. Since we established that PC1 represents words we all use, the distance below the line indicates how frequently the word is used between us all. For example, “yeah” and “house” are the most used words across the chat. This makes sense as we are pretty informal and all live together. We can do the same thing for PC2, which identified the ways Tom speaks differently. He uses “nbn” a lot because he is the one who set up the internet. “Tom” is a common word for Zac and I, not only because we love to bully our housemate Tom, but because we also have a few mutual friends (and some not friends) called Tom that we talk about in the chat.\nI sent all these plots to the group (I like to keep them informed) and Em said “I’m apparently the only one who laughs in this chat”. Now this brings up an interesting point in how this analysis was run, and it shows how PCA can bring out some patterns that may not be immediately recognisable in the data.\nThe data cleaning will correct for things like capitalisation (so here Here and HERE are all the same word) but if the words differ by letters (here and herez) thanks to typos or spelling mistakes, they are registered as different words. This creates a problem for registering words that I typically use, since: 1) I’m an absolutely abysmal speller and rarely spell a word the same way twice; and 2) I type laugher according to how funny I think something is (haha vs hahahahahahaha) This means, someone like Zac who almost always laughs in the same way with “lmfao”, or Em with “hahaha” and “hahahaha’, have all their chat laughter collected into one observation. Looking through the records I laugh to a similar degree, but almost all of them are recorded as unique words in the frequency count, and thus don’t make it to the analysis. Tom just never laughs at anything."
+ "objectID": "posts/boosting/index.html#part-2-the-learning-rate-the-number-of-models-and-the-model-complexity",
+ "href": "posts/boosting/index.html#part-2-the-learning-rate-the-number-of-models-and-the-model-complexity",
+ "title": "Learning Boosting Through Me Getting Fired from Tutoring",
+ "section": "Part 2: The Learning Rate, The Number of Models, and The Model Complexity",
+ "text": "Part 2: The Learning Rate, The Number of Models, and The Model Complexity\n\nRash Decisions in Tutoring Is a Dangerously Simple Method\nWhen I arrive at Riley’s house, I explain I don’t have any computer games or worksheets because I disagree with them morally, however I could cover future school work and invent some fun questions. Riley’s mum was not a big fan of my moral plight to take down “big tutoring”. After a brief discussion about how “we are all a little disorganised” which everyone knows is mum code for “you are disorganised”, she sent me home. Later I received a call from my boss about being “ill-prepared” because I should have just brought computer games and worksheets like the last tutor recommended. I explained my side, and by boss was sympathetic, but I never got another tutoring job from them again. Unfortunately, due to Riley’s mum being unsupportive of trying new teaching methods, the best speed at which Riley should cover new content wont be found. He might have learnt better with longer sessions, or with another student, or doing literally anything other than playing computer games. Much in the same way that we can tailor the environment and complexity of a tutoring session, boosting can improve its predictions by changing the learning rate, number of models and the model complexity.\n\n\nTinkering the Complexity of the Boosting Model\nWhen using boosting, we need to be aware of how the learning rate (or shrinkage), the number of models and the model complexity impact our final prediction. The learning rate decides how much “predictive power” we take from each trees. Smaller learning rates need more models to get a working prediction, larger learning rates run the risk of giving too much power to outlier models, and missing minor complexities. The number of models (trees in our example) is just decided in parallel with the learning rate, and is essentially how much computational time we are willing to dedicate to our model. The depth of the tree is similar, in the sense that with enough trees, a stump tree can capture any relationship, however if we don’t have the capacity for enough models, we can increase the complexity of each individual model to add more nuance to the final prediction."
},
{
- "objectID": "posts/pca/index.html#biplot-putting-it-all-together",
- "href": "posts/pca/index.html#biplot-putting-it-all-together",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
- "section": "Biplot: Putting It All Together",
- "text": "Biplot: Putting It All Together\nNow these plots only show one principal component each, and also don’t have the loadings on them. I started by separating the elements of the plot, but making several plots when the information could be conveyed with a single plot is tiresome. Now that we understand each of the components by themselves, lets make a biplot to show how this information is usually conveyed all together.\n\nTypically we use the first two principal components when we build the biplot because they contain the most variance, and thus the most information. This final plot is usually how a PCA is presented to us, with the observations and loadings plotted together and each axis representing a principal component. While the plot looks a little different now, the interpretations are still the same, and as a matter of fact understanding the observation is a little easier than before. Since we have the loadings on the plot too, we no longer need to hold the interpretation of the PCs in our mind to understand the observations. On the x axis, the further to the left a word is, the more we all use it, on the y-axis, the further down an observation is, the more Tom specifically uses it. Now we can make analysis of our observations using this combined information, rather than separating it. For example, looking at the biplot we can see that while “tom” is used a LOT in the chat overall, that is largely due to Zac and I, rather than Tom saying his own name.\nThe biplot allows us to summarise most of the information covered in this post in a single figure, and knowing how to interpret it makes your life much easier. That being said, if you have a lot of loadings you might still need to separate the plots as a biplot can get messy and crowded when we have too many."
+ "objectID": "posts/boosting/index.html#part-3-need-to-know-when-to-quit",
+ "href": "posts/boosting/index.html#part-3-need-to-know-when-to-quit",
+ "title": "Learning Boosting Through Me Getting Fired from Tutoring",
+ "section": "Part 3: Need to Know When to Quit",
+ "text": "Part 3: Need to Know When to Quit\n\nOverfitting in Learning\nI know someone has spent too long studying when I see forum posts asking if some obscure topic is going to be on the exam. Once you have run out of things to focus on that are important, you start to focus on the things that are less and less important, until you are sitting awake at night crying about the sheer vastness of knowledge that you could never hope to learn. Knowing when to quit is an important part of life and machine learning. Most people tell other to “try try and try again” my motto is “if you aren’t feeling it, quit”. After several years of tutoring, I was no longer feeling it, and it was time to quit. It turns out repeatedly being told “the continuity of functions doesn’t matter” and “dividing a number by 0 is 0” my soul had been crushed and I wasn’t doing my job properly any more. I had too much baggage and it was time to quit. Just like with tutoring, boosting needs to know when to quit too.\n\n\nBoosting can Overfit\nUnlike in bootstrapping, boosting has the potential to overfit. Since the later predictions are the cumulative prediction of all the models that came before, and the new models are only concerned with what those models got wrong, the overall benefit of each model is less than the model before it. This means that eventually, the tangible benefit of building a new tree becomes zero. Because of this, we always need to be aware of our ensemble complexity and manually set a stopping criteria."
},
{
- "objectID": "posts/pca/index.html#conclusion",
- "href": "posts/pca/index.html#conclusion",
- "title": "Using PCA to Bully My Housemates (Specifically Tom)",
+ "objectID": "posts/boosting/index.html#conclusion",
+ "href": "posts/boosting/index.html#conclusion",
+ "title": "Learning Boosting Through Me Getting Fired from Tutoring",
"section": "Conclusion",
- "text": "Conclusion\nWhile PCA plots can seem confusing at first, once you break them down into their components, they are pretty straight forward to understand. Also Zac said I need to include his twitter handle which is @zaccheus_e so I can direct people to an illiterate and poorly structured rebuttal."
+ "text": "Conclusion\nBoosting employs three techniques that make it similar to effective human learning. First it focuses on mistakes, secondly it is important to tailor the complexity of any one session, and finally it need to be manually stopped or otherwise your model will stare into the abyss of the unknowable in existential dread.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
{
- "objectID": "posts/2022-05-28-ggplot-sf/index.html",
- "href": "posts/2022-05-28-ggplot-sf/index.html",
- "title": "How long do maps on ggplot facets take?",
+ "objectID": "posts/hackathon_2023/index.html",
+ "href": "posts/hackathon_2023/index.html",
+ "title": "Hackathon 2023",
"section": "",
- "text": "If you’re a ggplot user, making faceted plots must be a tool in your belt. If you happen to do some spatial analysis, you would be familiar with maps. Today, I will show you my surprise findings about the rendering time to make faceted maps.\nThis example comes from Chapter 7 of Paula Moraga’s book Geospatial Health Data: Modeling and Visualization with R-INLA and Shiny and I have simplified it for this demonstration. In essence, there are two datasets:\nSimple feature collection with 88 features and 1 field\nGeometry type: POLYGON\nDimension: XY\nBounding box: xmin: -84.8203 ymin: 38.40342 xmax: -80.5182 ymax: 42.32713\nGeodetic CRS: NAD83\n# A tibble: 88 × 2\n NAME geometry\n <chr> <POLYGON [°]>\n 1 Auglaize ((-84.13476 40.65755, -84.13467 40.65755, -84.13405 40.65753, -84…\n 2 Crawford ((-82.77258 40.99589, -82.77258 40.99588, -82.77168 40.99588, -82…\n 3 Montgomery ((-84.06231 39.8366, -84.06301 39.83665, -84.06501 39.83677, -84.…\n 4 Guernsey ((-81.22986 40.06315, -81.22987 40.06308, -81.22992 40.06119, -81…\n 5 Clark ((-83.83875 39.8233, -83.83889 39.82335, -83.83904 39.82339, -83.…\n 6 Gallia ((-82.18737 38.72608, -82.18727 38.72558, -82.18707 38.72488, -82…\n 7 Fairfield ((-82.82307 39.80773, -82.82307 39.8078, -82.82305 39.80801, -82.…\n 8 Darke ((-84.43157 40.15801, -84.43148 40.15487, -84.43148 40.1542, -84.…\n 9 Monroe ((-81.22569 39.57838, -81.24065 39.57883, -81.2413 39.57885, -81.…\n10 Portage ((-81.3184 40.98861, -81.31892 40.98862, -81.31927 40.98862, -81.…\n# … with 78 more rows\n# A tibble: 1,848 × 3\n county year SIR\n <chr> <dbl> <dbl>\n 1 Adams 1968 0.725\n 2 Adams 1969 0.588\n 3 Adams 1970 1.03 \n 4 Adams 1971 0.654\n 5 Adams 1972 1.05 \n 6 Adams 1973 0.693\n 7 Adams 1974 1.15 \n 8 Adams 1975 1.17 \n 9 Adams 1976 0.936\n10 Adams 1977 0.644\n# … with 1,838 more rows\nThe details on calculating SIR is not the focus of this post and Section 7.1 to 7.2 of Paula’s book has detailed all the steps. Here I attach the script to generate these two data sets in case you would like to give it a spin:\nWhat we would like to do here is to show the SIR values of each county on the map across years. This would require us to join the two datasets, supply the combined data into ggplot, plot the underlying map, fill the county polygon with SIR, make facets with year, and lastly tweak the theme and the fill scale. Let’s give this plot a name, say target:\ncombined <- ohio %>%\n left_join(sir, by = c(\"NAME\" = \"county\"))\n\ntarget <- combined %>%\n ggplot() +\n geom_sf(aes(fill = SIR)) +\n facet_wrap(~year, dir = \"h\", ncol = 7) +\n ggtitle(\"SIR\") +\n theme_bw() +\n theme(\n axis.text.x = element_blank(),\n axis.text.y = element_blank(),\n axis.ticks = element_blank()\n ) +\n scale_fill_gradient2(\n midpoint = 1, low = \"blue\", mid = \"white\", high = \"red\"\n )\n\ntarget\nEasy peasy.\nBut, have you thought about how long it would take to provide this plot to you?\nLet me show you some components of this plot as benchmarks on the timing, here I have:\nOkay, now it is your time to make a guess:"
- },
- {
- "objectID": "posts/2022-05-28-ggplot-sf/index.html#footnotes",
- "href": "posts/2022-05-28-ggplot-sf/index.html#footnotes",
- "title": "How long do maps on ggplot facets take?",
- "section": "Footnotes",
- "text": "Footnotes\n\n\nTo make a proper benchmark of time, ideally each plot (p1 - p21) should be evaluated repetitively to obtain a distribution of the elapsed time. I set up a script with 50 repetitions and let it run overnight, but what I got next morning was “RStudio quit unexpectedly”. I suspect there is something going on with opening and closing the graphic devices too many times…↩︎"
+ "text": "Overview\nThe third NUMBAT hackathon was held Feb 22-24, 2023 in San Remo, Vic. A hackathon is style like RopenSci events where attendees brainstorm potential projects, and join to gether in small groups to tackle a subset of these.\nProjects\nProjects tackled can be found on github.\n\n\n\nParticipants\n\n\n\n\n\nBrainstorming the projects\n\n\n\n\n\nMaking sushi\n\n\n\n\n\nAussie barbecue - love the aprons!\n\n\n\n\n\nPelican feeding"
},
{
- "objectID": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html",
- "href": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html",
- "title": "Do you need some analytics help? Maybe our internship program is for you!",
+ "objectID": "posts/2022-10-18-cran-rev-dep/index.html",
+ "href": "posts/2022-10-18-cran-rev-dep/index.html",
+ "title": "Diving into dependen-“sea”",
"section": "",
- "text": "We have a new Masters in Business Analytics in the Econometrics and Business Statistics department in the Monash Business School and in the final semester of the program, the students have an option to do an internship. The aim is to give our students some work experience in analytics and to build links with various companies, institutions, and charities.\nThe program is highly selective and has an annual intake of around 50-60 student. The majority of our students will undertake the internship component of the program.\nThe students will have advanced R and data science skills. As well as having experience with various types of statistical analysis and machine learning, they have strong training in modern techniques for building reproducible analysis pipelines, automated reporting, and building interactive tools for understanding data.\nThe students will also meet regularly throughout the project with an academic from Monash and their fellow students to discuss problem solving and analysis techniques that are relevant to their projects. Previous students have worked on a diverse selection of projects including"
+ "text": "When writing a package, we may want to use functions in other packages. This creates a dependency for our package and a reverse dependency on the package we borrow functions from. As one of the recipients of the isoband email1, I’m curious to know how interconnected CRAN packages are. Luckily, it is not too hard to get data on this, and so the journey begins…"
},
{
- "objectID": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html#footnotes",
- "href": "posts/2022-02-18-do-you-want-a-monash-business-analytics-intern/index.html#footnotes",
- "title": "Do you need some analytics help? Maybe our internship program is for you!",
+ "objectID": "posts/2022-10-18-cran-rev-dep/index.html#footnotes",
+ "href": "posts/2022-10-18-cran-rev-dep/index.html#footnotes",
+ "title": "Diving into dependen-“sea”",
"section": "Footnotes",
- "text": "Footnotes\n\n\nMonash will provide insurance for all students provided they are not classified as employees.↩︎\nThis may be anonymised, deidentified, or redacted as required, as long as it is still possible to assess the student’s work.↩︎\nWe are happy to discuss other options for assessing the code if using github or sharing the code is not feasible for your company. This should be decided before the internship begins.↩︎"
+ "text": "Footnotes\n\n\nOn 5th Oct, CRAN sent out a massive email to inform 4747 downstream package maintainers of the potential archive of package isoband on 2022-10-19.↩︎"
},
{
- "objectID": "posts/bias_variance_flexibility/index.html",
- "href": "posts/bias_variance_flexibility/index.html",
- "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
+ "objectID": "posts/MLE/index.html",
+ "href": "posts/MLE/index.html",
+ "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
"section": "",
- "text": "When we are building a machine learning model you have a choice of a simple, which would be an inflexible, model vs a complicated, or very flexible model. We need to decide how flexible the model should be to work well for future samples. An inflexible model may not reflect a complex underlying process adequately and hence would be biased. A flexible model has the capacity to capture a complex underlying process but the fitted version might change from one sample to another enormously, which is called variance. This difference is illustrated in the figure below.\nWhen we think of how the bias and variance change with flexibility, we typically only look at its behaviour on average. In the plot below, the left side corresponds to an inflexible model and the right side corresponds to a flexible model. We can see that the test error stay slightly above the training as flexibility increases, until the text error shoots up. Visualisations like this are shown frequently in the textbook “An Introduction to Statistical Learning with Applications in R” by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani, which largely inspired this blog post. While this explains the behaviour of our test error on average, it doesn’t give a complete understanding of how our test error estimate will act within any individual sample. This is where we find the benefit of understanding the error distribution. The distribution of the test error allows us to not only understand the average behaviour, but also how that behaviour may change from sample to sample."
+ "text": "As a child, your parents are seen as deities that can do no wrong, that is until you are doing a first aid course 10 years later and learn that a broken arm is not an “I’ll take you to the hospital tomorrow if it still hurts” level emergency. Growing up I started to realise my Dad’s life lessons were somewhat unorthodox and below are some of my favourite quotes.\n\n“If you are going to light fires with your brother, make sure you do it by the pool. The ground is paved and if you set something on fire I’d rather it be you than the house”\n“I’m not going to any of your soccer games until you turn 12. I watched your brother play when he was younger. It was very boring and the other kids parents were insufferable”\n“If someone wants you to do something, they will probably pay you for it. So make sure you get paid. Unless I ask, then you need to do it for free.”\n\nThe last quote was probably the worst thing he taught us, at least for his financial security. It meant my siblings and I learnt to squeeze as much money out of my parents as we could. They paid me to cut my hair, get to school on time, go to swimming lessons, nothing was ever done for free. I even haggled my baby teeth with my mum for $50. This idea expanded to school yard, where my peers were much poorer than my parents, but also easier to part from their money. In grade 2 I had a period of selling stickers outside the tuckshop for spare change. The profit system was simple, sell stickers to my peers for a 10000% mark up. This plan was eventually shut down by “the man” i.e. the staff because some parents had complained about their kids not even getting lunch and just buying stickers. My most effective and long lasting money making plan however, was birthday parties. By the end of middle school, I was pulling in about $2000 a party. One of the most important elements in making a birthday profitable was the ratio of kids invited to kids that turn up. At the time I did guesstimation on this ratio, but now we are going to look at it in a more formal manner, using Maximum Likelihood Estimation (MLE)."
},
{
- "objectID": "posts/bias_variance_flexibility/index.html#flexibilitys-influence-on-test-error",
- "href": "posts/bias_variance_flexibility/index.html#flexibilitys-influence-on-test-error",
- "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
- "section": "Flexibility’s Influence on Test Error",
- "text": "Flexibility’s Influence on Test Error\nWhen changing the flexibility of a model, the test error distribution will go through three phases, that affect both its expected value, and variance.\n\nPhase 1: Decreasing Bias in Model\nWhen our model is biased, we are forcing our data into constraints that don’t reflect the true relationship between the variables. Since we have not captured the true relationship of the parameters, any sample drawn from our population will also have a more complicated relationship than that of our model, and have error from bias. This relationship is illustrated below, where our high error is largely the result of too much bias in the model. Both distributions are similar to each other, but far from zero.\n\n\n\n\n\nPhase 2: Optimal Fit\nIncreasing the flexibility will reduce the bias which will decrease the error. The optimal error will have smaller error for both training and test, but they will both be pretty similar. If you have captured the true relationship of the data with your model (if there is one), the distributions should perfectly overlap. This is unlikely to happen, since your model will always have a bias towards any quirks in your training set, and thus perform better on that set most of the time. So we instead will interpret the optimal fit is when the test error reaches its minimum (before the variance causes the total error to start to increase).\n\n\n\n\n\nPhase 3: Increasing Variance in Model\nAs we start to overfit our model, we introduce more error from variance than we are losing from decreasing bias. This has two effects on the distribution of the estimated test error. First, it causes the distribution to shift upwards as we have once again missed the true relationship in the population. This miss is different from bias however, as we have overfit our model to the specifics of the test set sample, thus new samples drawn from the same population will not have a similar error. This causes the distributions to shift away from each other. Additionally, the variance of the test error estimate will also increase. Overfitting means a higher penalty for samples that just happen to be different from our training set, and a higher reward for those that just happen to have similar quirks. Ultimately that makes the estimates more unreliable, and thus have a higher variance.\n\n\n\n\n\nUnderstanding with an Example\nThis influence from flexibility can best be seen with an example. To illustrate this, we will use the Auto data from the ISLR package, and fit a model to predict mpg using a polynomial of horsepower. If we take a look at the scatterplot of the two variables below, we can see that the linear model might not be flexible enough, but anything more flexible than a polynomial of about 4, will very likely overfit to the training sample. The plot below shows the data with a loess fit.\n\n\n\n\n\n\n\n\n\nWe can see the effect on the distributions using the animated density plot below. Here we have taken 100 different samples, and fit a model that predicts mpg using a polynomial degree of 1 to 15 of horsepower. Here we can see the above hand drawn illustration and interpretation of the variable relationship play out. Initially, increasing the flexibility of our model eliminates bias and causes both distributions to shift down. At polynomial degree 4, they stop at the minimum, and then for polynomial degrees higher than that, variance is introduced, and the test error increases in both expected value and variance.\n\n\n\n\npng"
+ "objectID": "posts/MLE/index.html#step-1-identify-joint-density-function",
+ "href": "posts/MLE/index.html#step-1-identify-joint-density-function",
+ "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
+ "section": "Step 1: Identify Joint Density Function",
+ "text": "Step 1: Identify Joint Density Function\nI mentioned in the beginning that the function that can easily cross between the worlds of the outcomes and parameters is the joint density function, but what is it? Basically it takes a bunch of random variables (in our case, students) and says what is the chance the entire group has a specific outcome (in this case, attend or not attend). \nMoving from each students probability to the joint probability is not simple task. We can either create a model that understands the intricate inter-personal relationships of this second grade class, or we can make 3 assumptions that will greatly simplify the problem. Statisticians generally prefer the method that doest require reading children’s diaries and so we are now going to perform these assumptions on my second grade class.\n \n\nAssumption 1: Independence\nThe first assumption we are going to make about our students is that they are independent, i.e any particular kid going to my party is not at all influenced by any other student. In doing so we now have a dataset consisting of children who are completely immune to peer pressure, both a school administration and statisticians fantasy. Unfortunately we have also lost some realism along the way. Why do this? Well, right now the only thing we know is each student has a some probability mass function (which we will get to in assumption 3) but no information on how the students interact with each other. What if one student coming means other wont come? or a students coming means another will certainly come? In order to find the probability of this very specific outcome we have ended up with we need information about the variables relationship. Here we can either figure out the complicated interpersonal relationships of the children, or assume they are all independent. With this assumption, the joint PMF is the product of each individual PMF (this is literally the definition of independence). Now our students don’t interact, and we have taken our first step in simplifying our problem.\n \n\n\nAssumption 2: Identical\nNow our joint PMF is the product of 24 unique PMFs. The problem is, I don’t really care about the individual students (they are all dollar signs to me). I only care about the overall proportion of students. Here we can simplify our problem further by assuming there is some overall class PMF, and every student is just a random variable drawn from that. To use this assumption in our joint density function, we just say the probability of every student coming is the same. Now we have 24 observations drawn from a single distribution, which means we only need a single individual PMF to define the PMF of all the students.\n \n\n\nAssumption 3: Identify The Distribution For Individual Parameters\nAs a final step, we still need some individual PMF to put in the big product we have created. Since every student either comes or doesn’t come, we can easily say the PMF for each student follows a Bernoulli distribution. Ultimately this step just depends on what outcome you want to measure, and since I only really care about a yes/no outcome, a Bernoulli will do just fine. Now we have a joint PMF to work with."
},
{
- "objectID": "posts/bias_variance_flexibility/index.html#sample-to-sample-changes",
- "href": "posts/bias_variance_flexibility/index.html#sample-to-sample-changes",
- "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
- "section": "Sample to Sample Changes",
- "text": "Sample to Sample Changes\nHere it is important to highlight the difference between a population and a sample, so we can better understand how an unfortunate test and training split can hurt error estimates. A population is all the data on what you are trying to make an inference on. For example, if I want to make an inference on the true relationship between mpg and horsepower, the Auto data is a sample of that. Generally we would be interested to make statements for mgp and horsepower for all possible cars, where all possible cars would be our population. If I want to make an inference on the relationship between mpg and horsepower in the Auto dataset (which is a weirdly specific realm to keep your inferences to but each to his own I guess) then this data is the population sample. For our sample to be representative, it needs to both be randomly drawn, and large enough. Unfortunately, even when we draw our samples to be decently large in size, and random, we will still occasionally get some unrepresentative splits. A sample that is unlike the population will bring the validity of any inference we try to make using that sample (including predictive models) into disrepute. Below is an illustration on how the sample will influence the fit among other interpretations.\n\n\n\n\npng\n\n\n\nThat being said, it’s highly unlikely to get a difference that dramatic in an actual sample. In reality, minor, almost invisible to the eye differences in your sample will create large differences in your MSE estimates.\n\nAn Example of Sample Influence on Error\nThe scatterplots below shows two of the training and test sample splits that were used in the phases example. One produced the best test error on the polynomial 15 model (MSE= 105) and the other, the worst (MSE=9837). Is there a remarkable difference?"
+ "objectID": "posts/MLE/index.html#step-2-make-your-likelihood-function",
+ "href": "posts/MLE/index.html#step-2-make-your-likelihood-function",
+ "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
+ "section": "Step 2: Make Your Likelihood Function",
+ "text": "Step 2: Make Your Likelihood Function\nWow what a beautiful joint PDF… What do we do with it? Well I said in the beginning that the function that gives probability of outcomes and the function that gives probability of parameters is the same function just with a different unknown. Here are the two directions we could take with our joint PMF. \nSince in this case our unknown is the parameter, we are going to use the likelihood function. Here we can actually put find the Likelihood function for our particular birthday party results.\n\nL(\\theta)=\\theta^{18}(1-\\theta)^{6}\n\nBut to simplify it here with the outcomes wouldn’t be an accurate representation of how we usually have to conduct MLE. So I’m going to leave in the product notation. Now that we have a function that shows how likely different values of \\theta (the probability a student turns up) are, we need to find its maximum."
},
{
- "objectID": "posts/bias_variance_flexibility/index.html#how-our-estimation-method-influences-our-test-error-distribution",
- "href": "posts/bias_variance_flexibility/index.html#how-our-estimation-method-influences-our-test-error-distribution",
- "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
- "section": "How Our Estimation Method Influences Our Test Error Distribution",
- "text": "How Our Estimation Method Influences Our Test Error Distribution\nA glaring issue with our test error estimate is its high variance, which means less certainty in the conclusions we draw from our test estimates. If we want a test error estimation method that is less susceptible to this issue of variance, we could try using a cross validation method. All methods, like the test error shown above, will still follow the general phases caused by increasing flexibility, but some have a lower overall variance (at the cost of more bias).\n\nThe Phases Example Using Cross Validation\nWhen we originally looked at the test error, it was estimated with the validation set approach (test in the plot) for simplicity. Now, let’s redo those distribution estimations of error from the mpg and horsepower models, but also look at the distribution of the 10-fold (k10cv), and 5-fold cross (k5cv) validation methods.\n\n\n\n\npng\n\n\n\nHere we can see the bias variance trade off play out with our estimates of test error, just as they would with our model fit. Our cross-validation methods in order of increasing variance are:\n\n5-fold CV < 10-fold CV < Validation Set Method\n\nThe methods in order of increasing bias are:\n\n10-fold CV < 5-fold CV < Validation Set Method\n\nIn general, the k-fold CV bias and variance depends on the value of k, where LOOCV (k=n) is approximately unbiased."
+ "objectID": "posts/MLE/index.html#step-3-math-time-logs-and-derivatives-and-more-derivatives",
+ "href": "posts/MLE/index.html#step-3-math-time-logs-and-derivatives-and-more-derivatives",
+ "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
+ "section": "Step 3: Math Time : Logs and Derivatives and More Derivatives",
+ "text": "Step 3: Math Time : Logs and Derivatives and More Derivatives\nI mostly wanted to focus on the difference between a PMF/PDF and a likelihood function in this post, but for the sake of completeness I’m going to finish the estimation. That being said I’m not going to be very detailed. Our next step in the process is to take the log.\n\nWhy take the log?\nThe answer is really just mathematicians are lazy. From high school you may remember that when you want to find the maximum of a function you take the derivative and set it equal to 0. The thing is, we have a massive product right now, and the product rule is a pain to do. Especially when we have 24 functions multiplied together. Thanks to log laws, taking the log of this nightmare function both doesn’t change the value the maximum is at (thanks to some math stuff I won’t go into) and also means we have to take the derivative of a big sum instead of a big product, which is really easy.\nL(\\theta)=\\prod^{24}_{i=1}\\theta^{x_i}(1-\\theta)^{1-x_i}\nI’m going to do some cosmetic changes before applying the log.\nL(\\theta)= \\theta^{\\sum_{i=1}^{24} x_i}(1-\\theta)^{24-\\sum_{i=1}^{24}x_i}\nThen we have our log-likelihood.\nlogL(\\theta)= log(\\theta)\\sum_{i=1}^{24}x_i+ log(1-\\theta)(24-\\sum_{i=1}^{24} x_i)\n\n\nThe first derivative\nNow we take the first derivative. When our likelihood function has a rate of change of 0, it’s about to fall back down. So we take the derivative with respect to the value we want to maximise and find the parameter that is the most likely given our set of outcomes.\nlogL'(\\theta) = \\frac1\\theta {\\sum_{i=1}^{24} x_i}- \\frac1{1-\\theta}(24-\\sum_{i=1}^{24} x_i)\nSince the first order condition is that we would like the first derivative to be equal to 0, this is where I put the hat in because this isn’t true in general, only for our estimate.\n\\frac1{\\hat{\\theta}} {\\sum_{i=1}^{24} x_i}- \\frac1{1-\\hat{\\theta}}(24-\\sum_{i=1}^{24} x_i)=0\nWhich we solve to find\n{\\hat{\\theta}} = \\frac1{24}\\sum_{i=1}^{24} x_i\nNow that we have the solutions we can substitute in our values from our sample of party go-ers and get the probability any one person will turn up.\n{\\hat{\\theta}} = 0.75\n\n\nThe second derivative\nThe lazy of us ignore this step, although it is technically still important. I also tend to ignore it, and will do so here for the sake of brevity. Whoops. We already have our estimate, this is more about making sure we have a clean solution. Taking the second derivative ensures our estimate is a maximum, and not some other stationary point."
},
{
- "objectID": "posts/bias_variance_flexibility/index.html#to-summarise",
- "href": "posts/bias_variance_flexibility/index.html#to-summarise",
- "title": "A Deep Dive into How Flexibility Affects The Bias and Variance Trade Off",
- "section": "To Summarise…",
- "text": "To Summarise…\nAs the flexibility of our model increases, we know that the estimated model will have a decrease in bias and increase in variance. This change in our model causes both a change in the mean and variance of our estimated test error. A lot of the difference is caused by the increasing impact of our random sample split, however it is not something that is visually noticeable. Like the model, the method of test error estimation also has its own bias and variance trade off, and it can be balanced using cross validation methods.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
+ "objectID": "posts/MLE/index.html#conclusion",
+ "href": "posts/MLE/index.html#conclusion",
+ "title": "How a 5th Grader Makes $1500 in a Single Night (and How They Could Make More Using MLE)",
+ "section": "Conclusion",
+ "text": "Conclusion\nI used all the money I made from birthday parties to buy about $10 000 worth of manga books because I was a huge Weeb. Sadly I ended up donating them all to the school that expelled me in year 11. Turns out being mercenary enough to make buckets of money as a child doesn’t matter if you waste it all on books you are forced to give away when you move to Melbourne because student accommodation doesn’t come with a wall of free library space. I’m sure there is a lesson in here somewhere.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
},
{
- "objectID": "posts/permutation_variable_importance/index.html",
- "href": "posts/permutation_variable_importance/index.html",
- "title": "Using the Bachelor to Understand Permutation Variable Importance",
+ "objectID": "posts/regularisation/index.html",
+ "href": "posts/regularisation/index.html",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
"section": "",
- "text": "The season of the bachelor is upon us, and what better way to celebrate my love of drawn out reality TV, than to use it to explain permutation variable importance in the random forest model. For those who are not familiar, The Bachelor is a dating show where each week female contestants are eliminated when they do not receive a rose during the rose ceremony. The winner is famously difficult to predict, and many complicated factors (screen time, number of dates, ect) mean our variables are ever evolving through the season and difficult to use in analysis. Today we will not be predicting the winner of the bachelor (as fun as it sounds) but rather, we will use The Bachelor as the basis of an example in calculating variable importance.\n\nWhat Matters Most When Choosing A Partner\nAnyone who has viewed the show for many years starts to notice a trend in the girls who always make it to the end of the competition. In the image below I have circled the top six participants from last year’s season.\n\n\n\nNotice anything? The girls at the end of the bachelor are overwhelmingly blonde. Of course regular viewers would notice other things too. Like how every season has a group skydiving date that ends with one of the girls crying, overcoming her fear, and getting extra time with the bachelor (when I type this out the show sounds stupid). However we are going to focus on the hair, specifically how we can find out how important hair colour is in separating the winners from the losers.\n\n\nIntroducing Our Bachelorettes\nFor our illustration, let’s make an example competition that consists of 10 people, broken down into their most arbitrary characteristics: name, hair colour, eye colour, and job.\n\n\n\nObviously the real winner isn’t chosen on these characteristics alone, but this is a fake example and my fake bachelor is a shallow guy. First we give all the girls a final position in the fake competition, and assign them to one of three groups: finalists (top 3), place moderately (middle 4), and losers (bottom 3).\n\n\n\n\n\nA Normal Random Forest Model\nBefore we can even look at variable permutation, we need a random forest model. If you need refreshing on how they work, a random forest model will take B bootstrapped samples, and build a tree for each. Usually, just by chance, about a third of the contestants will not be used to build each tree, these are the out of bag contestants.\n\n\n\nTypically, for more complicated data sets, random forest models use a random subset of all the predictors at each node. However, Since we only have 3 predictors, we will ignore that for this example (it won’t have any major influence on our results). This model will have multiple trees, but for simplicity, we are only going to look at the first tree in depth, which is illustrated below.\n\n\n\nContestants 2,5,7, and 9 are our out of bag contestants and so were not used to build the tree. Running these four contestants through the tree we get our out-of-bag (OOB) error.\n\n\n\nNow at this point we have a bootstrapped sample, a tree, and an OOB error for all of the B trees in our forest (but we have only looked at the first in depth). This is the basis of a typical random forest model, and it is also what we will use as a point of comparison when we permute our variables.\n\n\nPermutation Variable Importance\nTo calculate the importance of a variable (in this case hair), we randomly permute that variable among the observations. 
This creates a new dataset where all the variables are the same EXCEPT for the one variable we are checking. So for the bachelor example, the girls have all the same characteristics as before except their hair colour is randomised.\n\n\n\nRationally, we can tell that if our Bachie isn’t using hair colour as a key decider for his life partner (as we would hope), randomising that variable would have no effect on the girls position in the competition. People getting divorced over dyed hair is no way for a society to function. Again, we calculate our OOB error, using the tree above and contestants 2,5,7 and 9. However, we now take our predictors from the table with the permuted hair variable.\n\n\n\nThis gives us an OOB error for the version of the bachelor where love is colour blind. The difference between the first OOB error and the OOB error for the permuted observations will give us the importance of hair colour in the first tree. We repeat this calculation for all trees in the forest, and take the average to find the overall variable importance. That in a nutshell is how we calculate the permutation variable importance.\n\n\nFinal Comments Before we Leave the Mansion\nIt easy to see the logic behind this method of calculating variable importance. If we are essentially rolling a dice to decide a variable, it shouldn’t be useful in making predictions. If previously that variable was important, we have caused serious damage to the predictive power of our model. While this isn’t a complete computation in variable importance (since we only calculated it for one tree and one variable), it’s purpose is to take a look under the hood of the process, and, hopefully, into the heart of our bachelor.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
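If you would rather let software run this calculation across every tree and every variable, here is a hedged sketch using the randomForest package on invented bachelorette data (the post's ten-contestant table is not reproduced here).

```r
# Sketch: permutation variable importance as reported by randomForest.
# importance type 1 is the mean decrease in accuracy when a variable is
# permuted in the out-of-bag samples, averaged over all trees.
library(randomForest)

set.seed(1)
bachelorettes <- data.frame(
  hair = factor(sample(c("blonde", "brunette", "red"), 100, replace = TRUE)),
  eyes = factor(sample(c("blue", "brown", "green"), 100, replace = TRUE)),
  job  = factor(sample(c("nurse", "model", "teacher"), 100, replace = TRUE))
)
# made-up outcome that depends mostly on hair colour
bachelorettes$result <- factor(ifelse(
  bachelorettes$hair == "blonde" & runif(100) < 0.7, "finalist", "loser"
))

rf <- randomForest(result ~ ., data = bachelorettes, importance = TRUE)
importance(rf, type = 1)  # hair should dominate the other predictors
```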
+ "text": "Back when I lived with my sister I barely managed to drink once a month, and health wise, I was living a good life. Unfortunately, my sister decided to get engaged and her fiance was all “its weird that we live with your hermit sister” and “you two have been co-dependent for years its unhealthy and it’s time to stop”. When I moved in with my friends at the beginning of the year I was immediately tossed in to a long strict lock down. I had to deal with living in a house of people I liked, all of which had no schedule and were all bored out of their minds, so, to cut a long story short, we all started drinking a lot. I have since significantly cut back (I have gone back to my original “barely once a month” amount), but during our trial period for high functioning alcoholism, our friend Fynn introduced us to a “guess your uni grades” drinking game. Here is how it works:\n\nGuess your mark for all the units you are taking this semester\nWhen you get your results, calculate the difference between your guess and your actual result. e.g. if you guess 51 for Topology and actually get a 73 because the exam was scaled, you get 22 points.\nTake your points for each unit and sum them up. e.g. If topology got you 22 points, data structures 7 points, and a project unit was 3 points, your total is 32 points.\nIf you did not do a 4 unit work load, you scale your points up and round to the nearest integer to match that. e.g. if you had 32 points and did 3 units, your scaled score is 43.\nThe number of points you have is the number of drinks you have to do.\n\nFynn’s version is proper shots. Unfortunately because the example case was based on my housemate Tom, who apparently has next to no idea how he is doing in his courses, we had to change our rules to shots of mixed drinks. Even with this change we calculated that there was still a chance Tom would be killed by alcohol poisoning. After a 3-1 house vote we agreed we were willing to make that sacrifice for the sanctity of the game. My housemates in order of least to most points were:\n\nZac with 4 drinks\nMe with 13 drinks\nEm with 17 drinks\nTom with 43 drinks\n\n\n\n\nA visualisation of both the guessing and social order of the house. Here, Tom has died of alchohol poisoning.\n\n\nThis game led into a discussion about whose grades are the most difficult to predict. For example, there are things that seem to make guessing easier, such as completely in semester units. While things like my and Tom’s history of high variance puts us at a natural disadvantage. The first step to understanding what affects our changes in grades is to predict them. Figure 1 below gives a basic visualization of the house’s grades. The x axis represents the year and semester, however it is mostly arbitrary.\n\n\n\nThe house’s grades for every semester of uni.\n\n\nLooking at this plot we can visually see some of our personality quirks in the data. This plot makes it rather obvious which semester I had a mental breakdown and was diagnosed with ADHD. Em’s consistently high marks show the benefit of grade based anxiety, and the slight drop at the end shows the trade off that occurs when you start looking after your mental health. Zac’s grades all sit around the same range, because despite being a reliably high achiever, he over commits to extra-curricula and often hands assignments in several days late, which essentially stops him from getting higher than an 80. 
Tom has no reason for being the way he is.\nWe want to try and improve our predictions by building a model that forecasts next semesters results. Fynn’s house had a total of 69 drinks, while we had 77 and losing to another household (especially one as inferior as Fynn’s) is a blight on our competitive record. The problem with building a model is that there is very little data here, especially when compared to the number of variables at hand. This means even something as simple as a linear regression will have too much flexibility and will likely over fit, so to fix this, we need to turn to regularisation."
},
{
- "objectID": "posts/internships/index.html",
- "href": "posts/internships/index.html",
- "title": "Can our Masters students help you?",
+ "objectID": "posts/regularisation/index.html#the-drinking-game-that-killed-tom",
+ "href": "posts/regularisation/index.html#the-drinking-game-that-killed-tom",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
"section": "",
- "text": "Do you have a data analysis task and want some help with it? We have a lot of Masters students who might be able assist."
+ "text": "Back when I lived with my sister I barely managed to drink once a month, and health wise, I was living a good life. Unfortunately, my sister decided to get engaged and her fiance was all “its weird that we live with your hermit sister” and “you two have been co-dependent for years its unhealthy and it’s time to stop”. When I moved in with my friends at the beginning of the year I was immediately tossed in to a long strict lock down. I had to deal with living in a house of people I liked, all of which had no schedule and were all bored out of their minds, so, to cut a long story short, we all started drinking a lot. I have since significantly cut back (I have gone back to my original “barely once a month” amount), but during our trial period for high functioning alcoholism, our friend Fynn introduced us to a “guess your uni grades” drinking game. Here is how it works:\n\nGuess your mark for all the units you are taking this semester\nWhen you get your results, calculate the difference between your guess and your actual result. e.g. if you guess 51 for Topology and actually get a 73 because the exam was scaled, you get 22 points.\nTake your points for each unit and sum them up. e.g. If topology got you 22 points, data structures 7 points, and a project unit was 3 points, your total is 32 points.\nIf you did not do a 4 unit work load, you scale your points up and round to the nearest integer to match that. e.g. if you had 32 points and did 3 units, your scaled score is 43.\nThe number of points you have is the number of drinks you have to do.\n\nFynn’s version is proper shots. Unfortunately because the example case was based on my housemate Tom, who apparently has next to no idea how he is doing in his courses, we had to change our rules to shots of mixed drinks. Even with this change we calculated that there was still a chance Tom would be killed by alcohol poisoning. After a 3-1 house vote we agreed we were willing to make that sacrifice for the sanctity of the game. My housemates in order of least to most points were:\n\nZac with 4 drinks\nMe with 13 drinks\nEm with 17 drinks\nTom with 43 drinks\n\n\n\n\nA visualisation of both the guessing and social order of the house. Here, Tom has died of alchohol poisoning.\n\n\nThis game led into a discussion about whose grades are the most difficult to predict. For example, there are things that seem to make guessing easier, such as completely in semester units. While things like my and Tom’s history of high variance puts us at a natural disadvantage. The first step to understanding what affects our changes in grades is to predict them. Figure 1 below gives a basic visualization of the house’s grades. The x axis represents the year and semester, however it is mostly arbitrary.\n\n\n\nThe house’s grades for every semester of uni.\n\n\nLooking at this plot we can visually see some of our personality quirks in the data. This plot makes it rather obvious which semester I had a mental breakdown and was diagnosed with ADHD. Em’s consistently high marks show the benefit of grade based anxiety, and the slight drop at the end shows the trade off that occurs when you start looking after your mental health. Zac’s grades all sit around the same range, because despite being a reliably high achiever, he over commits to extra-curricula and often hands assignments in several days late, which essentially stops him from getting higher than an 80. 
Tom has no reason for being the way he is.\nWe want to try and improve our predictions by building a model that forecasts next semesters results. Fynn’s house had a total of 69 drinks, while we had 77 and losing to another household (especially one as inferior as Fynn’s) is a blight on our competitive record. The problem with building a model is that there is very little data here, especially when compared to the number of variables at hand. This means even something as simple as a linear regression will have too much flexibility and will likely over fit, so to fix this, we need to turn to regularisation."
},
{
- "objectID": "posts/internships/index.html#masters-of-business-analytics-students",
- "href": "posts/internships/index.html#masters-of-business-analytics-students",
- "title": "Can our Masters students help you?",
- "section": "Masters of Business Analytics students",
- "text": "Masters of Business Analytics students\nOur Masters of Business Analytics students are well-trained in the entire workflow of data analysis including data collection, munging, exploratory analysis, modelling and reporting. Our program is based around R, so all students should have relatively advanced R skills. Some of them may also have Python, SQL and other language skills. Our students are also taught to use modern reproducible practices and are comfortable with using git, Rmarkdown, etc.\nTowards the end of their degree, our students take on an internship in which they work with an organization on a data analysis project. The project is for a minimum of 225 hours (30 working days) to be undertaken at a time suitable to the organization and the student. This does not have to be done during normal teaching periods. Normally, the majority of the hours are to be spent on site (or virtually) embedded in the organization. It is not a requirement that students are reimbursed for this work, although some organizations choose to pay students a nominal wage during their internship. (Monash will provide insurance for all students provided they are not classified as employees.)\nSuitable projects should involve a substantial data analysis or modelling component, and should address a problem of interest to the sponsoring organization. At the start of the program, the organization, Monash University and the student must all agree on a suitable project.\nStudents will write a report of about 30 pages outlining the analysis they have undertaken. A copy of this report will be provided to the sponsoring organization, along with all the code that was produced as part of the analysis.\nEach student will need a supervisor from the sponsoring organization who must meet with the student at least weekly to ensure the student is on track. All participating students will also meet regularly (every few weeks) with a Monash University academic who can help with any technical issues.\nAt the conclusion of the project, the supervisor will need to provide a one page report to Monash University on the student’s performance.\nIf you think this might be of interest to your organization, please contact Professor Rob Hyndman."
+ "objectID": "posts/regularisation/index.html#what-is-regularisation",
+ "href": "posts/regularisation/index.html#what-is-regularisation",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "section": "What is regularisation?",
+ "text": "What is regularisation?\nRegularisation is essentially a systematic way to reduce the variance and increase the bias of a model and improve the overall error through the bias variance trade off. There are quite a few regularisation methods, but I’m not going to go through all of them here. Rather I have summarised three of the more common techniques below.\n\nSubset selection: This technique selects a subset of the predictors to use in the regression. There are three common types of subset selection: forward subset selection, backward subset selection, and best subset selection. Forward subset selection starts with the null model and, at each step, adds the variable that reduces the test error the most, until the model is at a point where the addition of new variables don’t improve the the test error. Backward subset selection does the same but in reverse, it starts with the model containing all the variables and removes predictors until the test error does not improve. Best subset selection makes every possible model (the power set of the predictors) and chooses the one with the minimum error, however this can also over fit and is often computationally infeasible.\nPCA regression: You may remember principal component analysis (PCA) from one of my previous posts as an unsupervised learning method, but it can also be used as a regularisation technique. Using the principal components (which are just linear transformations of the original predictors) as predictors in an OLS regression can reduce the variance of the model.\nShrinkage methods: these methods make a new error function that is the sum of the residual sum of squares (RSS) and a penalty term and selects coefficients by minimising this new error function. The two main methods are lasso, which minimises the function RSS + \\lambda\\sum_{j=1}^{p}|\\beta_j|, and ridge which minimises the function RSS + \\lambda\\sum_{j=1}^{p}\\beta_j^2, where \\lambda is a tuning parameter. These additional penalties force the coefficient estimates to be closer to 0.\n\nThe method used in the example, and the main focus of the rest of this post, will be the shrinkage methods, as they have the most interesting theory and haven’t been explained previously on the blog. Now that we have seen how we perform regularisation, this still leaves the question why it works. There are two main benefits to regularisation, lower error and better model interpretability. I will explain how each of them work below.\nThe first reason to use regularisation is to reduce the variance of our model. Often, we do this implicitly by choosing a simple model due to a lack of data. For example, if we had built a neural network and found that the model had too much variance, we could instead build a random forest as a less flexible alternative. Regularisation is used when our model is already as simple as it can be, e.g. a linear regression or LDA in the case of classification, and it still has too much variance. We can’t get more data, and to remove a level of flexibility from a linear regression would be to predict the average (a constant). Regularization allows us to reduce this error from variance by further restricting the model parameters and thus allowing a model that is even more inflexible than a normal least squares linear regression.\nThe second reason to use regularisation is to improve the interpretability of the model. A large number of unnecessary variables not only introduces error, but also complicate the model. 
The benefit of using regularisation to improve model interpretability stems from the idea that there are signal and noise variables and we want to keep the signal variables while removing the noise variables. Regularisation removes predictors that have a spurious relationship to the response variable and leave us with less coefficients to interpret."
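To make the two shrinkage penalties concrete, here is a minimal R sketch using simulated data (not the grades data from this post); the coefficient values and the lambda are illustrative only:

library(glmnet)
set.seed(1)
n <- 100; p <- 10
X <- matrix(rnorm(n * p), n, p)
beta <- c(3, -2, 1.5, rep(0, p - 3))            # three signal variables, seven noise variables
y <- drop(X %*% beta) + rnorm(n)

lasso <- glmnet(X, y, alpha = 1, lambda = 0.5)  # alpha = 1: lasso (absolute value) penalty
ridge <- glmnet(X, y, alpha = 0, lambda = 0.5)  # alpha = 0: ridge (squared) penalty

cbind(truth = beta,
      lasso = as.matrix(coef(lasso))[-1, ],     # drop the intercept row
      ridge = as.matrix(coef(ridge))[-1, ])
# The lasso typically sets the noise coefficients to exactly 0,
# while ridge only shrinks them towards 0.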
},
{
- "objectID": "posts/ConvolutionalNeuralNetwork/index.html",
- "href": "posts/ConvolutionalNeuralNetwork/index.html",
- "title": "Mario Party: Destroyer of Friendships and Explainer of Convolutional Neural Networks",
- "section": "",
- "text": "This is The Blog of a Mario Party Master\nIn pre-COVID times, my friends and I would gather around for a fun activity called “lets ruin our friendship by taking Mario party way too seriously”. The night always starts with laughter and few drinks, and ends with me standing on a chair, pointing at my closest friends, and screaming “I’m going to make you cry tears you thought were reserved for the death of your mother”. Once the moment has passed it seems a little dramatic, but at the time, we all truly believe that the speed you can get virtual candy out of a jar is an appropriate measure of human worth.\n\n\n\n\nThe last thing my friends see before I block them\n\n\n\nThere are several games that cause spite, but one called “Absent Minded”, pictured below, always sends us into an argument. Basically, you have 3 characters, and a slowly appearing image, and you have to find out which character is absent as the pictures become clearer. The faster you correctly identify, the more points you receive. I have never lost the game. Additionally there are 3 levels of this mini game, and so 3 different ways the images are shown to you: Jumbled Pictures, Blurred Pictures, and One At a Time.\n Example: the “One At a Time” level \nNow, obviously the levels are meant for humans to play, and not for teaching machine learning, but the challenge each level presents gives us an interesting way to view the concepts. The jumbled picture level can show us how our typical machine learning algorithm will view an image. The blurred picture level shows the benefit of using convolutional neural networks, and the one at a time level can go in the trash! Sorry, not every analogy will fit perfectly into a machine learning theory.\n \n\n\nHow Does The Picture Look to a Computer\nBefore I jump into explaining the concepts, I want to explain how your computer will “see” your image. Statistical models do not have eyes, and so for any picture we want to use, an observation needs to be converted in to a dataset. The process is illustrated below (although the variable size would be each pixel, and not limited by how small I can make my handwriting).\n \nFirst our image is broken up into its individual pixels. For greyscale they are typically given a single number to represent its “darkness”, and colour images are given three different values for red, green, and blue (RGB). This dataset is what will be used to represent your model (although I will use the image rather than the dataset for visualisations).\n\n\nPart One: The Jumbled Picture Level\n\nTheory: What’s Wrong With What We Already Have\nTechnically, we could use any other machine learning algorithm to classify an image. We can call these “bag of pixel” methods as they don’t consider the pixels location in the image. They essentially cut the image up into pixels, shake them up in a bag, toss them out, and classify based off these values. Ultimately, the problem with any “bag of pixel” model, is that it fails to capture the shape of the image independent of its location. This means only images that have the right features in the right area are correctly classified.\n\n\nAnalogy: What Makes the Jumbled Level Hard\n \nThe jumbled picture stage is interesting, because we cannot look at the characters as a whole to identify who is present. Since We cannot identify the pictures using the overall shape of the character, we need to look for the presence of independent key features. 
This reliance on identifiable features in the correct location is also what identifies our ordinary algorithms.\n\n\nPutting Them Together\nIn reality, this jumbling in our pictures would be at a pixel level, not big chunks, but the minigame is more of a point of illustration rather than a technical tool to understand machine learning. Key features being in the right place can be used successfully to identify images, but ultimately we have “zoomed in” too far to see the relationship between pixels. We can conceptualise this further with an example.\nIf we are trying to identify Mario, cap-looking pixels where Mario’s cap should be make it easy. If we have a picture where Mario doesn’t have his cap, that makes it hard. If we have a single picture where Mario is laying down so his cap is where his feet should be, that makes it even worse.\nThis is essentially the problem with our regular Machine learning algorithms. Key features in the right place make classification easy. No key features makes classification hard. Key features in uncommon places will be incorrectly assumed to be something else, and cause misclassification. This is where the benefits of using a convolutional neural network come in.\n\n\n\nPart 2: The Blurry Image Level\n\n\nTheory: How does a Convolutional Neural Network Work?\nBefore we return to explaining machine learning concepts in terms of Mario Party, lets take a step back, and look at how convolutional neural networks work in general. The illustration below is an overview of the structure of a CNN. The information contained in the image undergoes several transformations using layers that can be classified as either “feature extraction”, or “classification”.\n \n\nFeature Extraction Layers\nFeature extraction is what really sets the CNN apart from other models. These layers make new variables that are more “computer vision friendly”. The process creates nodes that identify certain aspects of an image, such as Yoshi’s Shell or Peach’s hair, and converts them to variables we can use to make predictions. The most common (and technically interesting) layers used in the process are explained below. The “options” are specifically related to building CNN in the R package, Keras.\n\nConvolutional Layers\n \nThe convolutional layer is what makes a neural network, a convolutional neural network. This layer creates a small window (called a kernel), that travels along the image and looks for a specific feature. The kernel_size option selects the size of the grid that is run along the image. Larger grids can overlook important details, while smaller grids can provide too much information and create noise. Typically the standard is somewhere in the range of a (3x3) grid. This information is taken from image to feature map using a filter. The filter is basically the type of feature we are looking for, when we run the kernel over the image. The number of times we do this, each with a unique filter, is the “depth” of the layer. In Keras, that is given by the filter option. As for which filters it uses, that is trained by the model.\nThe only convolutional layer that takes information from the image is the first one. All the following feature maps are computed on the previous feature maps. 
The new filter is applied to some combination of the previous layers feature maps and thus more convolutional layers mean variables that represent more intricate features.\n\n\nMax Pooling\n Max Pooling is a step in our convolutional neural network that is essentially a dimension reduction of our feature maps. Literally just think of it as doing no transformation to the image, other than shrinking it down. As with all dimension reductions, the goal here is to get rid of the pixels that contain noise (e.g. white space) and keep the pixels that identify the image (e.g. Mario’s cap). This layer reduces our chance of overfitting, and thus is a key player in the bias and variance trade off in convolutional neural networks.\n\n\nHow does it work?\nJust like the original image, feature maps can be seen as a grid of pixels. Max pooling sections each feature map into smaller non-overlapping grids, takes the largest value of those pixels, and moves it on to the next layer. The example illustration above is looking for the darkest pixel on a 2x2 grid. Grid size is important, we want to minimise the bias introduced into the model by keeping the grid small, but also eliminate noise and not make the grid so small the layer does nothing.\n\n\nWhy the Maximum?\nMax pooling is a rather counter-intuitive layer, statistically speaking. Through practice, it seems that the maximum is the measure that minimises this information loss, rather than measures of central tendency as we would expect. As to why, the best analogy I’ve seen for the max pooling stage is from the data sceptic podcast. If you are looking for your keys, and everyone in the group says they don’t have them but one person, you aren’t going to take the median or average value. We are not looking for how much the picture looks like Mario’s cap on average, we are looking for any sign of Mario’s cap.\n\n\n\nClassification Layers\n\nDense\nA dense layer allows takes the nodes from the previous convolutional layers, and make a fully connected layer. This essentially takes our feature maps as inputs and runs them through a typically neural network, which we won’t go into detail about here. Our final classification layer is also a dense layer, that outputs the probabilities of each classification option. This is the final output of the model.\n\n\n\nDropout Layers\nUnlike the previous layers, dropout layers can be placed among the feature extraction or classification layers. In fully connected neural networks its use is quite simple; it samples random nodes to remove from the layer, which prevents overfitting. This interpretation does not follow for dropout layers placed among the feature extraction layers (the reason is matrix multiplication but its not worth going into) however it still helps prevent overfitting. Sometimes the number of pooling layers is limited by the resolution of the image (we can only max pool it to be so small) so if we need an additional measure against overfitting, we can include dropout layers.\n\n\n\nAnalogy: The Blurry Image Level\n \nCircling back to Mario Party, the blurry levels difficulties are different to the jumbled level. Here, we struggle to make out key features, but must use the overall shape and colour to identify the character. As the image becomes clearer, it becomes easier to see, and we are more certain of our answer, however this additional certainty does not come without cost. The longer you wait to select an answer in the minigame, the more likely it is that you lose. 
This ultimately means that if the differences between characters are too subtle, the amount of time it will take to identify the difference isn’t worth the cost, and we are better off guessing.\n\n\nPutting It Together\nWaiting for certainty in the minigame is similar to overfitting in our convolutional neural networks. The deeper we go, the more likely it is that we overfit, and the more computationally heavy it is. We can add in dropout layers, but eventually there is an upper limit on the certainty we can have in our prediction. Unlike the normal models however, CNNs can capture shape AND the key features, they just need to be deep enough.\n\n\nMario Party and Convolutional Neural Networks: A Neat Analogy or Desperate Stretch to Make Statistics Seem Fun?\nObviously the CNNs have more nuance to them than can be explained using a Mario Party minigame, but it doesn’t do a bad job of giving us a conceptual understanding. Normal algorithms are limited by their inability to identify characters independent of their location, an issue we can circumvent using CNNs. CNNs capture the shape and general features of a character. Although really the most important learning experience from this post should be that if you come to my house to play Mario Party you might end up dead in a ditch somewhere.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
+ "objectID": "posts/regularisation/index.html#example-do-the-grades-need-a-regularised-model",
+ "href": "posts/regularisation/index.html#example-do-the-grades-need-a-regularised-model",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "section": "Example: Do the grades need a regularised model?",
+ "text": "Example: Do the grades need a regularised model?\nTechnically we don’t need a reason to run a regularised model, it is just another method we can use to balance the bias and variance trade off, but in cases where there is a very small amount of data it is more useful to do than not. In this example we want to predict the grade of each unit in the up coming semester using several factors, such as student, department, whether the unit was in semester 1 or 2 (I suspect we do worse in semester 2 due to burn out), the level of the unit (most of us should do better in first year units), whether the unit was online, etc. There are also several interaction terms that could be useful, for example an interaction term between the Harriet student dummy and the online semesters would capture the later jump in my grades. There are obviously more interesting (and useful) variables we could include, such as whether we needed to move house during semester, if we went on a holidays during midsemester break, if we were seeing a therapist, etc. These variables would likely produce a better prediction and more easily interpreted coefficients, however I’m going to keep the model simple and leave it as is. Once we have our data set we can immediately see two reasons to use a regularised model over a normal OLS.\nFirst of all, the matrix is full rank, that is, we have variables that are a linear combination of other variables in the data set. For example, Tom and I are the only two students who take maths units (MTH), so with the student other department variables, the MTH variable becomes obsolete. There are several other variables with this issue. I’m not sure which variables are better to keep (department or student) and this issue will likely get worse as I add interaction terms.\nSecond of all, with such a small data set, any model with more than a handful of predictors will have a large amount of variance. Figure 2, below, shows the test and training error of a simple linear model that’s flexibility has been increased (somewhat arbitrarily) with the addition of new predictors. In this plot, a 0 in flexibility indicates a model that predicts the mean, while an 8 indicates a model that contains all the predictors in the data set as well as every every interaction term. This plot only shows the change in mean squared error (MSE) over a single sample of the data. To see the MSE of the model over several samples (and properly assess the variance of each model) we should do some resampling.\n\n\n\nThe trainning and test error compared with model complexity.\n\n\nFigure 3 shows the density of the test and training error of 50 samples of:\n\na basic linear model which predicts the mean of the training set for all test observations\na simple linear model which is an OLS model with only a handful of predictors I selected\na complex linear model which is an OLS model with every predictor and their interaction terms.\n\nThis gives us a cross validated version of Figure 2, and confirms what the previous plot indicated. First of all, it shows, a basic model has slightly too much bias because the training and test error are, on average, higher than the error of the simple model. It also shows that the complex model has over fit the data, given its consistently low training error and high unpredictable test error. We need a model that is somewhere between the complex “every variable” model and a constant. 
To find this model, we will use regularisation, specifically a shrinkage method.\n\n\n\nDensity plots of the training and test error of three linear models that differ in flexibility."
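The resampling behind Figure 3 could be done with a short helper like the one below. This is only a sketch: the grades data isn't published with the post, so the grades data frame, its mark column, and the three formulas are stand-ins for whatever the house actually used.

# Repeatedly split the data, fit a model, and record its test MSE.
resample_mse <- function(data, formula, n_reps = 50, prop = 0.8) {
  replicate(n_reps, {
    idx  <- sample(nrow(data), size = floor(prop * nrow(data)))
    fit  <- lm(formula, data = data[idx, ])
    test <- data[-idx, ]
    mean((test$mark - predict(fit, newdata = test))^2)
  })
}

basic   <- resample_mse(grades, mark ~ 1)                                          # predict the mean
simple  <- resample_mse(grades, mark ~ student * online)                           # a handful of predictors
complex <- resample_mse(grades, mark ~ (student + online + level + department)^2)  # everything

summary(data.frame(basic, simple, complex))  # compare the spread of test errors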
},
{
- "objectID": "posts/hackathon_2024/index.html",
- "href": "posts/hackathon_2024/index.html",
- "title": "Hackathon 2024",
- "section": "",
- "text": "Overview\nThe fourth NUMBAT hackathon was held May 28-30, 2024 in Daylesford, Vic. A hackathon is style like RopenSci events where attendees brainstorm potential projects, and join to gether in small groups to tackle a subset of these.\nProjects\nProjects tackled can be found on github.\n\n\n\nParticipants\n\n\n\n\n\nWorking on the projects\n\n\n\n\n\nWorking on the projects\n\n\n\n\n\nWorking on the projects\n\n\n\n\n\nand great food"
+ "objectID": "posts/regularisation/index.html#shrinkage-methods",
+ "href": "posts/regularisation/index.html#shrinkage-methods",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "section": "Shrinkage Methods",
+ "text": "Shrinkage Methods\nThe most common regularisation methods are ridge and lasso regressions. Lasso and Ridge follow the same general idea, which is to put additional restrictions on the coefficients of a linear regression, they only slightly differ on how they go about it. Lasso, will minimise RSS + \\lambda\\sum_{j=1}^{p}|\\beta_j|, and ridge will minimiser RSS + \\lambda\\sum_{j=1}^{p}\\beta_j^2. The turning parameter \\lambda decides how much the penalty term influences the final coefficient values. A large value of \\lambda means the penalty term outweighs the RSS and coefficients are estimated at 0, a small value of \\lambda means the penalty will not be factored in at all and the model will return the OLS coefficient estimates. Figure 4 shows a contour plot of the lasso penalty, RSS function, and lasso error term for a two variable model. The animation shows the contours of the lasso regression look more like the contour plot of the penalty term as \\lambda increases. In turn we can see the minimum value of the error function (and thus the estimated coefficients) moves from the OLS estimates (the minimum of the RSS) to 0 (the minimum of the penalty).\n \nWhat may not be clear from this animation, but does simplify our ability to visualise how this adjusted error works, is that for every value of \\lambda there is some value of s such that we are minimising the RSS subject to the constraint \\sum_{j=1}^{p}|\\beta_j| \\leq s in the case of lasso and \\sum_{j=1}^{p}\\beta_j^2 \\leq s in the case of ridge. This means that instead of trying to think about a complicated constantly changing error function, we picture our restraints as shown in the illustration below. Here I have drawn a contour plot of a hypothetical RSS for a two variable model. The plot on the left has the ridge regression constraints drawn over it, while the plot on the right has the lasso constraint. The size of the circle/diamond is related to the tuning parameter \\lambda. When \\lambda=0 the area of the circle/diamond is infinite, and when \\lambda \\rightarrow \\infty the circle/diamond becomes so constrained it forces every coefficient to 0. This allows us to see how the constraint impacts the selected coefficient estimates.\n\n\n\nAn illustration of the difference between the ridge and laso regression constraints\n\n\nSomething that is important to note is that lasso regression is more likely to set coefficients to 0 (and thus more likely perform feature selection) than ridge due to the diamond shape of the constraint. The minimum RSS value in figure 4 showed this in practice, as the minimum coefficient estimate quickly set \\beta_1 to 0 before further restricting \\beta_2. Most commonly we will visualise the way the coefficients change as \\lambda increases with a plot of the coefficient vs \\lambda values, as drawn below.\n\n\n\nAn illustration of how the coefficients change as lambda increases.\n\n\nThere is one final question we need to answer before we move on. How do we decide whether to use ridge or lasso regression? Well, if you think all the variables are relevant, use ridge regression, if you suspect some variables to just be noise, use lasso. Now, with an understanding of how shrinkage methods work, we can go back to our example."
},
{
- "objectID": "posts/election_hexmaps/index.html",
- "href": "posts/election_hexmaps/index.html",
- "title": "Hexmaps with sugarbag make it easier to see the electoral map",
- "section": "",
- "text": "Australia is a land of wide open spaces where the population concentrates in small areas. It can make for misleading map visualisations on statistics related to people. The May 20, 2022 ABC article The Australian election map has been lying to you explains this very neatly. It has alsp provided a better alternative to examine election results, in the form of a hexmap of Australia. The hexmap provided in the article is almost certainly manually constructed which is find for a construct once, use many times purpose.\nWhen you want to be able to make a hexmap on new spatial data or if the spatial groups change, the R package sugarbag can be helpful. This post explains how to do this, using the results as we have them today from yesterday’s election. (We’ll update these once the final results are released.)\nHere’s how to get started. Download the current spatial boundaries for electorates, from Australian Electoral Commission web site.\nLoad the libraries we need:\n\nlibrary(ggthemes)\nlibrary(sf)\nlibrary(sugarbag)\nlibrary(tidyverse)\nlibrary(plotly)\n\nRead in the spatial polygons, defining the boundaries. These files can be very large, and slow to draw. For these visualisations faster to draw is more important, so the boundaries can be simplified using rmapshaper::ms_simplify.\n\n# Spatial polygons\nelectorates <- sf::st_read(\"2021-Cwlth_electoral_boundaries_ESRI/2021_ELB_region.shp\")\nelectorates_small <- electorates %>% rmapshaper::ms_simplify(keep = 0.01, keep_shapes = TRUE)\n\nNext we need the election results. The ones here are manually constructed from the ABC results website. These results are joined to the map polygons, and colours are manually constructed to be one typically used by the party. The ggplotly() function enables labels to pop up on mouseover.\n\n# Read in data on current electoral results\nnew <- read_csv(\"electoral_2022.csv\") %>%\n select(Electorate:Party)\nnew_major <- new %>%\n mutate(Party_maj = fct_collapse(Party,\n LNP = c(\"LIB\", \"LNP\", \"NAT\")))\nelectorates_small <- electorates_small %>%\n left_join(new_major, by=c(\"Elect_div\"=\"Electorate\"))\nmap <- ggplot() +\n geom_sf(data=electorates_small,\n aes(fill = Party_maj,\n label=Elect_div),\n colour=\"white\") +\n scale_fill_manual(\"\", values=c(\"ALP\"=\"#E13940\",\n \"LNP\"=\"#1C4F9C\",\n \"GRN\"=\"#009C3D\",\n \"KAP\"=\"#906E3E\",\n \"CA\"=\"#FFC000\",\n \"IND\"=\"#66b2b2\",\n \"UNDEC\"=\"#808080\")) +\n theme_map()\nmap\n\n\n\n\n\n\n\n#ggplotly(map)\n\nAn interactive version can be found here.\nThe map is blue – it looks like the coalition won the election in a landslide, doesn’t it! (Please note the strange shape of the Cape of York is from the AEC spatial polygons provided! 
It is not due the the polygon thinning.)\nTo convert this into a hexmap, automatically with sugarbag, we need to\n\nFind the centroids of each polygon.\nCreate a hexagon grid with a desired size of hexagon, hs controls this.\nAllocate electorates to a spot on the grid.\nTurn the hexagon centroids into hexagons.\nJoin with election results.\nMake it interactive using ggplotly().\n\n\n# Find centroids of polygons\nsf_use_s2(FALSE)\ncentroids <- electorates %>%\n create_centroids(., \"Elect_div\")\n\n## Create hexagon grid\nhs <- 0.8\ngrid <- create_grid(centroids = centroids,\n hex_size = hs,\n buffer_dist = 5)\n\n## Allocate polygon centroids to hexagon grid points\nelectorate_hexmap <- allocate(\n centroids = centroids,\n hex_grid = grid,\n sf_id = \"Elect_div\",\n ## same column used in create_centroids\n hex_size = hs,\n ## same size used in create_grid\n hex_filter = 10,\n focal_points = capital_cities,\n width = 35,\n verbose = FALSE\n)\n\n# Make the hexagons\ne_hex <- fortify_hexagon(data = electorate_hexmap,\n sf_id = \"Elect_div\",\n hex_size = hs)\nelectorate_hexmap_new <- e_hex %>%\n left_join(new_major, by=c(\"Elect_div\"=\"Electorate\"))\nhexmap <- ggplot() +\n geom_sf(data=electorates_small,\n fill=\"grey90\", colour=\"white\") +\n geom_polygon(data=electorate_hexmap_new,\n aes(x=long, y=lat,\n group = hex_id,\n fill=Party_maj,\n label=Elect_div)) +\n scale_fill_manual(\"\", values=c(\"ALP\"=\"#E13940\",\n \"LNP\"=\"#1C4F9C\",\n \"GRN\"=\"#009C3D\",\n \"KAP\"=\"#906E3E\",\n \"CA\"=\"#FFC000\",\n \"IND\"=\"#66b2b2\",\n \"UNDEC\"=\"#808080\")) +\n theme_map()\nhexmap\n\n\n\n\n\n\n\n#ggplotly(hexmap)\n\nAn interactive version can be found here\nAnd that’s it! The sugarbag hexmap will expand the densely populated small areas outwards, while maintaining proximity to neighbouring electorates and to the city centre. It is a type of cartogram algorithm with two important differences: (1) uses equal area for each hexagon instead of sized proportional to population, and (2) allows some hexagons to be separated so that the geographic positions are reasonably preserved.\nThe hexmap makes it easier to see the results distributed across the country, and clearly with the predominance of red, that Labor won.\nData for this post can be found here.\nThis work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License."
+ "objectID": "posts/regularisation/index.html#predicting-the-grade",
+ "href": "posts/regularisation/index.html#predicting-the-grade",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "section": "Predicting the Grade",
+ "text": "Predicting the Grade\nLets apply this theory to our grades model to see if we can improve our predictions. Some of the variables are linear combinations of others, so there is absolutely no need to keep all the predictors. This means we should opt for lasso over ridge regression, although this does have one downfall. This example has a large number of interaction terms, and when we include interaction terms, we typically need to maintain a hierarchy so our variables are interpretable, e.g. we need to keep the Harriet and the Online dummy variables in the model if we want to include the Harriet:Online interaction term. Ridge and lasso regression do not follow this hierarchy when they shrink variables. Usually this would make predictability worse, however since every single predictor in this data set is a dummy variable, it isn’t going to cause (too) much of an issue. The main problem will be having almost no idea what the base line model is. From this point forward we will mostly focus on the improvements in test error, and continue with the lasso regression.\nTo find our lasso model, we need a \\lambda value. The best way to find this value is with cross validation, and thankfully the glmnet package does this for us. Figure 5, below, shows the mean test MSE and 95% confidence interval of the lasso regression for several values of \\lambda. The vertical dotted line indicates the \\lambda value that minimises the model error.\n\n\n\nSelecting our lambda value with the glmnet package’s cross validation method.\n\n\nWe can also visualise how our coefficients change as \\lambda increases. Figure 6 shows the change in the model coefficients as we allow \\lambda to approach 0 (or our L1 Norm to get infinitely large as shown on the x axis). The dashed line indicates the model associated with the \\lambda value found from cross validation. This allows us to better understand how some coefficients interact with each other. For example the Harriet:online interaction is the largest coefficient in every model, regardless of the \\lambda value, which indicates it is a consistently useful variable.\n\n\n\nThis plot shows the impact on our variables of a decreasing lambda (and thus increasing L1 norm).\n\n\nThe model that contains every variables as well as every student, unit level, department and online interaction term has 54 variables, the regularised model has only 20 variables, so there has been some serious culling. Figure 7 shows the predictors that made it into the final model. Since the baseline model (the one that we compare each dummy variable to) is now a mess, these coefficients are almost impossible to interpret.\n\n\n\nThe lasso model coefficients.\n\n\nFinally, we can compare the lasso model to the basic, simple, and complex models from figure 3. Figure 8 compares the cross validated RMSE of the three old models and the new lasso model. We can see that the simple model (that was just the student and online variables as well as all their interaction terms) may slightly outperform the lasso model, however there is so much overlap in the confidence intervals it is hard to say. In this example, the lasso model did not select variables that were better than my general intuition. Lasso can help you regularise to some degree, but even regularisation techniques can be given too many predictors to choose from, and it seems my intuition was enough to beat it this time.\n\n\n\nThe RSME of the final lasso model when compared to the previous models over several resamples."
},
{
- "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html",
- "href": "posts/reducing-teaching-duplication-with-unilur/index.html",
- "title": "Reducing duplication in teaching materials",
- "section": "",
- "text": "As a young, impressionable undergraduate, a Computer Science lecturer once told me:\nWe’ve all done it. Whether it’s hard-coding a time period and then changing dataset, or whether it’s hard-coding a population size in a model that’s quoted on national television, we all do our best to keep code clean to avoid these kinds of pitfalls.\nSo why, then, does it seem like so many of us have at least two copies of every tutorial and assignment? One with just the questions, and one with the solutions on it? I understand that both types of files are required, at least until generative AI makes us fully change our assessment, but the idea of having two identical questions in two different files makes me very nervous.\nIn fact, earlier this year while teaching a unit for the second time, one of my tutors pointed out that the solutions had a different set of numbers in the question compared to the ones students were answering. Unfortunately, this also materially changed the interpretation of the answer, and so I had to go through, re-issue solutions and re-mark a pile of assignments.\nAt EBS, a large portion of the content for our units are managed through RMarkdown, in a reasonably standardised format. As a sign of the times, every time I inherit a unit, I try to port material over to Quarto. It just feels like the right thing to do. Given the ability for both of these systems to output multiple formats on render, I started thinking about how to have one master question file that could output both the student question set, and the full solutions at the same time."
+ "objectID": "posts/regularisation/index.html#conclusion",
+ "href": "posts/regularisation/index.html#conclusion",
+ "title": "Trying to Save My Housements (Specifically Tom) from Alcohol Poisoning with Regularisation",
+ "section": "Conclusion",
+ "text": "Conclusion\nRegularisation can be used to reduce the variance and improve the interpretability of our model, but human intuition can still outperform it if we know enough about our data. That being said the models for our grade predictions turned out to be useless. Results for this semester have been released and Tom was 32 off, while the simple model was 55 off. Really, the lesson here is that no model, no matter how fine tuned, can predict a response variable that has an inexplicably high variance."
},
{
- "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#a-clunky-first-attempt",
- "href": "posts/reducing-teaching-duplication-with-unilur/index.html#a-clunky-first-attempt",
- "title": "Reducing duplication in teaching materials",
- "section": "A clunky first attempt",
- "text": "A clunky first attempt\nI spent a bit too much time starting at CSS and ad-hoc web development, even though the results are sometimes nice. When all you have is a hammer, everything looks like a CSS-based nail. Enter the chunk option:\n```{r, echo=solutions}\nlibrary(readr)\ncovid_cases <- readr::read_csv(\"https://docs.health.vic.gov.au/covid19/data/NCOV_cases_by_postcode_LGA.csv\")\n```\nThis is pretty straightforward. I set the variable solutions at the top of the file to be TRUE if I want solutions to be printed, and false otherwise. With a bit of fanangling, you could also pass this into params in the YAML at the top of the document, and then two calls to rmarkdown::render() with the relevant parameters would probably get you what you want. With a bit of styling:\n```{css}\n.solution {\n padding: 10px;\n border: 1px solid black;\n margin-bottom: 5px;\n}\n\n.solution { background-color: #F9E79F; }\n\n.solution::before {\n font-weight: bold;\n font-style: italic;\n}\n\n.solution::before { content: \"Solution\";}\n```\nyou can even make it look nice. It was a good first attempt, and saw me through for that semester. But, I still felt like there had to be a cleaner way, and something I could generalise to Quarto (which loses me the luxury of the rmarkdown::render() call)."
+ "objectID": "projects.html",
+ "href": "projects.html",
+ "title": "Projects",
+ "section": "",
+ "text": "WOMBAT 2022\n \n\n \n Workshop Organised by the Monash Business Analytics Team 2022 Communicating with Data\n \n\n \n \n \n \n \n \n \n \n \n \n\n \n\n \n \n\n \n \n \n \n \n \n \n \n \n tidyverts\n \n\n \n R packages for tidy time series analysis\n \n\n \n \n \n \n \n \n \n \n \n \n\n \n\n \n \n\n \n \n \n \n \n \n \n \n \n Quarto\n \n\n \n Quarto resources designed for use at Monash University\n \n\n \n \n \n \n \n \n \n \n \n \n\n \n\n \n \n\n\n\n\nNo matching items"
},
{
- "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#inheriting-a-solution",
- "href": "posts/reducing-teaching-duplication-with-unilur/index.html#inheriting-a-solution",
- "title": "Reducing duplication in teaching materials",
- "section": "Inheriting a solution",
- "text": "Inheriting a solution\nFor the first time this year, I am running a subject on my own. As I can’t really do anything independently in my life, I set about borrowing all the relevant templates for the LMS and converting handbook entries. At some point, I should share the Moodle template - it really is something, even though it isn’t mine!\nI was very fortunate. I inherited a very clean set of lecture notes from Emi Tanaka, that had even been ported to Quarto already. Bar some minor changes, I’ll teach this course as I got it in it’s first year, if for no other reason than so I can learn it fully!\nWhile in my pre-semester prep, I noticed that the tutorials are still Rmd files. Nothing else in the course is, so why these I wondered?\nIt turns out, in what shouldn’t be surprise, someone else had already engineered this master-copy-to-2-outputs solution:\n---\ntitle: 'ETC5523: Communicating with Data'\nsubtitle: \"Tutorial 1\"\nauthor: \"Michael Lydeamore\"\ndate: \"Week 1\"\noutput: \n unilur::tutorial_html_solution: \n toc: true\n suffix: \"-solution\"\n unilur::tutorial_html: \n toc: true\n suffix: \"-question\"\n---\nThe unilur package wsas designed for exactly this. I must say, it doesn’t look particularly active on GitHub, but nonetheless, it worked perfectly. With an extra argument to rmarkdown::render(output_format = \"all\"), two documents are produced, with the relevant suffixes on them. Brilliant!\nThe styling I inherited was a little egregious (I really can’t do white text on a green background), but it did the job. And no more multiple copies of files for me to royally screw up when I’m in a hurry.\nThe only problem is, it’s an Rmarkdown output format, not a Quarto one. And literally everything else in this course is Quarto. There’s even lectures dedicated to it. I stumbled upon a Quarto port, converting it to an extension, which was a great start. The system the author had gone with was to add a chunk option:\n```{r}\n#| unilur-solution: true\n```\ncombined with a YAML boolean show-solution: true, which you could change to generate the two files. I ported over the first tutorial reasonably quickly, and it performed almost as expected. I had nice expandable dropdowns for solutions:\n\nThe only problem was combining code chunks with text, which in a course all about how to explain your story, was quite important. This is because the only way to get a solution block to generate is with the chunk option, and so text solutions have to be enclosed in a block type. Embedding code chunks into a block… uh… block does work, but you lose syntax highlighting, and the ability to execute them if you want the solutions to actually compile.\nThe clunky solution to this is to have two solution blocks, and write the answer in such a way that the code comes after the text. I guess this is somewhat traditional, it’s certainly how I explained algorithms in my PhD thesis, but it is difficult for markers, and a bit jarring for students to have to jump around the page. Not quite as bad as figures at the end of a manuscript, but it has similar vibes. I don’t like it.\n\nDiving into the extension\nLike all Quarto extensions, the meat of what’s going on is in the _extensions/ folder. There’s a YAML file that gives defaults for what is added by the extension (which here is just a contributes for a LUA filter), some CSS, and a LUA filter. I can’t really write LUA, certainly not good LUA. 
My only experience of it is a terrible mod I made many years ago for Factorio, which as an aside is a brilliant game everyone should play.\nBut, I have enough CS experience and training to be able to read these things and clunk my way through them. Thankfully, this LUA filter isn’t particularly long. The Div function just checks if we’re in the right sort of thing (so a “cell”, and has the attribute unilur-solution == TRUE), and if it’s all good, spits out the solution. Otherwise, instead of returning the content (which would be el in this code), we return a totally empty element.\nThis means that extending the functionality to include a class is pretty easy. Just check if the class of the div (which for reasons unknown to me is always capitalised in LUA) is also called unilur-solution1:\n\nif (el.classes:includes(\"cell\") and el.attributes[\"unilur-solution\"] == \"true\") or (el.classes:includes(\"unilur-solution\")) then\n... do stuff ...\n\nThis worked a treat. Now I can use a Pandoc fenced div to specify solutions with text and code, and the code is highlighted and executed exactly as it would be on a normal block. The system works like this:\n\n::: unilur-solution\n\nHere is a solution that is inside a div. The contents of this will only be shown\nin the solution document.\n\n:::\n\nNow the last step: getting multiple documents to output from one master file, from one quarto render call.\n\n\nMultiple formats at once\nQuarto supports multiple format documents out-of-the-box, and for the most part, they work pretty well, minus a quirk with RStudio only rendering one of the formats. Use the CLI for that by the way, it works much cleaner.\nThere’s even support for formats with the same file extension so they don’t overwrite each other. Just add output-file to your YAML and you can generate two HTML files.\nSo this was looking pretty easy. I’ll just convert the extension to an output format, include it twice in the header of my tutorial and off we go. In case you ever need it, here’s how you can specify your own output format:\ntitle: Unilur Questions\nquarto-required: \">=1.3.0\"\ncontributes:\n format:\n common:\n filters:\n - unilur.lua\n show-solution: false\n html:\nYou set your document type as the folder your extension is in, plus the format you want. So in this case, I have:\nformat:\n unilur-question-html:\n output-file: tutorial-01-question.html\n embed-resources: true\nI forgot the -html the first time and caused myself a lot of pain.\nUnfortunately you can’t have the same output format twice, just changing the YAML options. I don’t understand why, I won’t pretend to understand why, and in the kindest way, I don’t think I really want to understand why.\nSo, sadly, the solution was to have two almost identical extensions: unilur-question, which defaults show-solution: false, and unilur-solution, which defaults to the opposite. Still two files, I guess, but it ends up being not too bad."
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html",
+ "href": "img/FlexiblevsInflexible/flexinflex.html",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "",
+ "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n \n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, bit to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
},
{
- "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#the-workflow",
- "href": "posts/reducing-teaching-duplication-with-unilur/index.html#the-workflow",
- "title": "Reducing duplication in teaching materials",
- "section": "The workflow",
- "text": "The workflow\nSo now, it’s pretty straightforward. You can use the extension like any other Quarto extension2:\nquarto install extension MikeLydeamore/unilur\nwhich will install both identical extensions. Set your YAML as per above, fence off your solutions using either the chunk option or the div, and quarto render your way to what is in my opinion, a much cleaner workflow.\nThe YAML header becomes:\nformat:\n unilur-question-html:\n output-file: tutorial-01-question.html\n embed-resources: true\n unilur-solution-html:\n output-file: tutorial-01-solution.html\n embed-resources: true\nwhich I think is pretty clean. It’s not quite the suffix tag that’s in the original extension, but I think I can live with that. If anyone knows a way to get the current name of the file in the YAML (or possibly elsewhere in extension-land), I’d love to hear it so I don’t need output-file anymore.\nIf you use this, or have other solutions, I’d love to hear about them. Please let me know however you see fit: Twitter, GitHub, or e-mail."
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html#a-conspiracy-theory-is-like-a-bad-model",
+ "href": "img/FlexiblevsInflexible/flexinflex.html#a-conspiracy-theory-is-like-a-bad-model",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "",
+ "text": "A few years ago my mum became very susceptible to suggestion, and made friends with a guy who was really good at speaking about nonsense with the authority to make it sound true. Listening to him sent her down a conspiracy theory rabbit hole, of which I had to experience second hand. Our interactions ended up boiling down to mum sending me a 20 minutes Youtube video about aliens building the pyramids, then I would wait the appropriate amount of time and send a text that said “Wow, what an interesting perspective”. I always hoped it would end the conversation and we could talk about something else, but instead it tended to inspire a paragraph long text rant about how the government was hiding free energy from us, and an 11 year old Texan genius had discovered the plot. When I think of flexible methods, I often have flash backs to that period of my life. Not because high degree polynomials were built by ancient aliens or an 11 year old genius but because we can use the pitfalls of conspiracy theories to understand the difference between flexible and inflexible methods.\n \n\n\nI think of flexibility as the trade off in capturing the “local” and “global” trends in our data. An inflexible model will capture the global trend of the data, but any relationship between our variables is lost. If we instead choose a flexible model, we are focusing on the local trends and giving our model a better chance at capturing variable relationships, bit to overfit to the sample. Flexibility has key interactions with 4 other elements of our model: the sample size, dimensionality, assumptions about the function, and irreducible error."
},
{
- "objectID": "posts/reducing-teaching-duplication-with-unilur/index.html#footnotes",
- "href": "posts/reducing-teaching-duplication-with-unilur/index.html#footnotes",
- "title": "Reducing duplication in teaching materials",
- "section": "Footnotes",
- "text": "Footnotes\n\n\nQuarto (and Rmarkdown for that matter) doesn’t have a LUA highlighter so you’ll have to read this one yourself.↩︎\nPending an open pull request↩︎"
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html#outrageous-claims-need-outrageous-evidence",
+ "href": "img/FlexiblevsInflexible/flexinflex.html#outrageous-claims-need-outrageous-evidence",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "1: Outrageous Claims Need Outrageous Evidence",
+ "text": "1: Outrageous Claims Need Outrageous Evidence\nMy mother is a “bit eccentric” to put it mildly. In the last few months, to only name a few things, she has bought a fire truck to start mud-crabbing (pictured below), bought some goats because the garden is a pain to manage, and turned the pool into a “fish Club Med” where she collects wildlife from the local creek and feeds them McDonalds for breakfast. From expulsions to arrest warrants, to the man she drank goon with at the beach who now lives in our house, the stories are endless. Despite this, never in my life had I ever been called a liar for telling them (the first time was at university orientation). People at my school had grown used to it, they had met my family and heard years worth of stories so I had a wealth of evidence to normalise my claims. Strangers didn’t have that, and so they didn’t believe my outrageous (completely true) claims. Similarly in statistics, if we want a complicated model we will need a large sample size to back it up.\n\nWhy Flexible Models Need a Bigger Sample\nIn general, the larger your sample size, the more likely it is you have captured the “true relationship”. If you are increasing the number of parameters to estimate (not literally for non-parametric models but the idea carries on) without increasing the sample size, we are in effect decreasing the “sample size” for each of the predictions, and thus decreasing the reliability of our model. Placing more weight on all the observations in calculating our estimates, means we are increasing the influence of outliers and unrepresentative samples. We can either have observations contributing to a large area but averaged over many observations, or over a small area where our estimates are averages over fewer observations. For example, If we have 10 observations and predict using the average, each observation contributes to 1/10th of the prediction, but all have the same. If we use 1-Nearest Neighbour, each prediction is only backed up by a single observation (illustrated below), however it is highly tailored to any relationships that may be specific to . Highly flexible models can be, and sometimes are, the appropriate choice to model a relationship, we just need a large sample to justify it. Outrageous claims need outrageous evidence."
},
{
- "objectID": "posts/secret-santa-2023/index.html",
- "href": "posts/secret-santa-2023/index.html",
- "title": "Secret Santa 2023",
- "section": "",
- "text": "Overview\nWrapped up the year in style at the NUMBATs end-of-year bash on Nov 22nd, 2023, at Clayton Campus! Secret Santa gifts exchanged, and we kicked back with a few interesting games for a fun-filled time!\n\n\n\nParticipants\n\n\n\n\n\nIncredible present of socks featuring numbats designs made by Di Cook’s mother"
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
+ "href": "img/FlexiblevsInflexible/flexinflex.html#the-internet---deliverer-of-facts-and-local-cult-meet-ups",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups",
+ "text": "2: The Internet - Deliverer of Facts and Local Cult Meet Ups\nThe introduction of the internet was the age of new information. Conspiracy theories were on their way out, now anyone can use their phone and find the facts in seconds. Or can they? What I unfortunately discovered when mum got involved with conspiracy theories, is that for every website with legitimate information, there are 50 that don’t. The sheer vastness of the internet means that whenever we expand our search for hidden truth, we are just as likely to discover falsities. This is a useful illustration in dimensionality.\n\nFlexible Models Are Hurt More By Additional Parameters\nDimensionality interacts with the flexible vs inflexible models in two ways. The first is that in some occasions adding dimensions can literally be seen as making the model more flexible. Think of adding a squared variable to a linear regression to make it quadratic, we have made the model more flexible by adding a dimension. The second way it interacts with our models, is by increasing the distance between observations, and thus increasing the domain of each variable. To get technical, each additional parameter makes the area each observation is responsible for increase exponentially. Just like how increasing flexibility increases the “weight” of observations by localising their impact on the model, dimensionality makes the total “area” bigger, and so it does a similar thing. Sometimes the relationship between our variables needs to be modeled with a highly flexible model, and so we need to keep this interaction between flexibility and dimension in mind so our model variance doesn’t get out of control."
},
{
- "objectID": "events.html",
- "href": "events.html",
- "title": "Events",
- "section": "",
- "text": "We hold regular seminars during the semester period (March-June, August-Nov). And there are numerous other events organised or related to NUMBAT.\nYou can add the events to your calendar via iCal or Google calendar."
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html#capitalism---the-gateway-conspiracy-to-lizard-people",
+ "href": "img/FlexiblevsInflexible/flexinflex.html#capitalism---the-gateway-conspiracy-to-lizard-people",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "3: Capitalism - The Gateway Conspiracy to Lizard People",
+ "text": "3: Capitalism - The Gateway Conspiracy to Lizard People\nNobody suddenly wakes up in the morning, looks in the mirror and says to themselves “Yes, today is the day. Today is the day I start believing in the lizard overlords.” I believe the process is more nuanced than that. Just like the “SayNoToPeerPressue” acting troupe who’s dreams I got to watch die in the comfort of my high school gym, I’m about to push the idea of gateways. From my personal experience, the process of becoming involved in conspiracies looks a little something like this: \nMy point is that ideas that hinge on something already well established in society are easier to swallow than those that aren’t. That is not to say entirely new theories must be wrong, but rather that they are harder for people to immediately understand and they are also more likely to be too out there for the general population to get on board with. I think of parametric and non-parametric models in a very similar way to how people think of capitalism vs lizard people conspiracy theories.\n\nNon-Parametric Models Are Usually More Flexible, But Not Always\nParametric models construct our function by assuming its type, and then estimating the best model within this range. Non-parametric models do not make any assumptions about our model’s form, but rather try to fit to the general shape of the data. Parametric and Non-parametric does not directly translate to flexibility; they both have the potential to produce a very flexible or inflexible fit. For example, a constant polynomial and a K-NN model where K=N would both predict the average response (the most inflexible model we can get). Rather, just like dimensionality, non-parametric models can fall into the same pitfalls as flexibility, and so the limits of our dataset should be kept in mind. By their nature, non-parametric models are more susceptible to variance from changes in the sample, as the sample is the only thing the model is using to make its predictions. Therefore, they are more likely to overfitting than parametric models and are usually more difficult to interpret. These features mean that in general non-parametric models are more flexible, simply by their nature, however they are still have the potential to be inflexible."
},
{
- "objectID": "index.html",
- "href": "index.html",
- "title": "Non-Uniform Monash Business Analytics Team",
- "section": "",
- "text": "Latest News\n\n\nWelcome to Professors Paulo Rodrigues and Professor Thomas Lumley, visiting us until June 2024, and Galit Schmueli visiting May 6-17.\nWelcome to new PhD students Jarryd Chapman, Tina Rashid Jafari 😄 We’re excited to welcome you!\nMonash Master of Business Analytics: Strap on an explorer’s backback and a skeptics hat and learn how to analyse data. Find out more.\nThe real numbats could use your help. If you would like to help with numbat conservation take a look at Project Numbat. You can buy numbat souvenirs or make donations to help with numbat conservation. Go to Project Numbat."
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html#there-are-always-going-to-be-loonies-on-the-internet",
+ "href": "img/FlexiblevsInflexible/flexinflex.html#there-are-always-going-to-be-loonies-on-the-internet",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "4: There are Always Going to Be Loonies on the Internet",
+ "text": "4: There are Always Going to Be Loonies on the Internet\nWe can all spend our entire lives trying to convince everyone on the internet that they are wrong, but at the end of the day, we live in a complicated world, with complicated people, and there are always going to be loonies on the internet. Rather than dreaming of a world where everyone knows everything all the time, the system should just be to manage the chaos. The important life skill to learn isn’t that everyone needs to be corrected, and to focus on the nutters, but rather enjoy the fact that the majority get most things right, most of the time. Socrates might disagree with my idea on majority votes but you win some you lose some.\n\nYou Will Always Have Irreducible Error and It’s Size Matters\nObviously we can never have a perfect prediction since we are working with random variables. We can make our models more flexible to try and account for as much of the error as we can, but if we do, we might end up missing the underlying system entirely. No matter how flexible our model is, we will never have perfection thanks to our irreducible error (an attempt at making one is illustrated below). The interaction between flexibility and irreducible error comes from its size. A large irreducible error means the general shape change more drastically between samples, while a small one means our samples will remain consistent. Just like dimensionality, assumptions about our model, and sample size, this is just something that needs to be kept in mind as it has a strong interaction with the flexibility of our model, and the error from variance."
},
{
- "objectID": "about.html",
- "href": "about.html",
- "title": "Monash NUMBATs",
- "section": "",
- "text": "NUMBATs is the name for our research group, Non-Uniform Monash Business Analytics Team, in the Econometrics and Business Statistics Department. We meet regularly to openly exchange and discuss research ideas and activity in an actively supportive and creative environment. People outside the group are welcome to attend, and can sign up to receive notifications about topics for each meeting, or check the calendar on the web site.\nOur mission is to advance methodology for making data analysis easier. We develop new techniques for statistical modeling and visualisation, and work hard to make these available to the general public by distributing open source software, mostly using R.\nIt is also our goal to raise awareness of Australian fauna and flora, particularly many of the beautiful endangered species. For example, in reality, numbats are insectivores, mostly found in Western Australia, and we encourage you to contribute to the conservation efforts by donating or purchasing products from Project Numbat."
+ "objectID": "img/FlexiblevsInflexible/flexinflex.html#to-conclude",
+ "href": "img/FlexiblevsInflexible/flexinflex.html#to-conclude",
+ "title": "4 Things We Can Learn About Conspiracy Theories and Model Flexibility",
+ "section": "To Conclude",
+ "text": "To Conclude\nDon’t let your mum hang out with weirdos, and treat conspiracy theories and overly complicated models with scepticism."
},
{
- "objectID": "people/visitors/galit.html",
- "href": "people/visitors/galit.html",
- "title": "Galit Shmueli",
+ "objectID": "people/visitors/paulo.html",
+ "href": "people/visitors/paulo.html",
+ "title": "Paulo Canas Rodrigues",
"section": "",
- "text": "galit.shmueli@gmail.com\n https://www.galitshmueli.com\n \n Google scholar\n Interests:\n \n explain or predictstatistical strategybiosurveillanceonline auctionscount data modelsquality control"
- },
- {
- "objectID": "people/index.html#current",
- "href": "people/index.html#current",
- "title": "Meet the team",
- "section": "Current",
- "text": "Current\n \n \n \n \n \n \n \n \n \n \n Alexander Ek\n \n \n Postdoctoral Researcher in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Catherine Forbes\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n Cynthia Huang\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Damjan Vukcevic\n \n \n Associate Professor and Director of Engagement\n \n \n \n \n \n \n \n \n \n \n \n \n \n David Frazier\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n David Wu\n \n \n Postdoctoral Research Fellow\n \n \n \n \n \n \n \n \n \n \n \n \n \n Di Cook\n \n \n Professor of Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Didier Nibbering\n \n \n Senior Lecturer in Econometrics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Gael Martin\n \n \n Professor of Econometrics\n \n \n \n \n \n \n \n \n \n \n \n \n \n George Athanasopoulos\n \n \n Professor of Statistics and Head of Department\n \n \n \n \n \n \n \n \n \n \n \n \n \n Harriet Mason\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Janith Wanniarachchi\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Jarryd Chapman\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Jessica Leung\n \n \n Lecturer in Business Analytics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Kate Saunders\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Klaus Ackermann\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Michael Lydeamore\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Mitch O’Hara-Wild\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Nuwani Palihawadana\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n P. G. Jayani Lakshika\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Patrick Li\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Rob J Hyndman\n \n \n Professor of Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Ruben Loaiza Maya\n \n \n Senior Lecturer in Econometrics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Swen Kuh\n \n \n Postdoctoral Researcher\n \n \n \n \n \n \n \n \n \n \n \n \n \n Tina Rashid Jafari\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n \n \n \n \n \n \n Xiaoqian Wang\n \n \n Postdoctoral Researcher in Statistics\n \n \n \n \n \n \n \n \n \n \n \n \n \n Yangzhuoran Fin Yang\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n No matching items"
+ "text": "paulocanas@gmail.com\n https://www.paulocanas.org\n \n Google scholar\n Interests:\n \n statistical learninghigh-dimensional dataforecastingtime seriesrobust statisticsdata visualization"
},
{
- "objectID": "people/index.html#visitors",
- "href": "people/index.html#visitors",
- "title": "Meet the team",
- "section": "Visitors",
- "text": "Visitors\n \n \n \n \n \n \n \n \n \n \n Galit Shmueli\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n Paulo Canas Rodrigues\n \n \n Professor\n \n \n \n \n \n \n \n \n \n \n \n \n \n Thomas Lumley\n \n \n Professor\n \n \n \n \n \n \n \n No matching items"
+ "objectID": "people/visitors/thomas.html",
+ "href": "people/visitors/thomas.html",
+ "title": "Thomas Lumley",
+ "section": "",
+ "text": "t.lumley@auckland.ac.nz\n https://profiles.auckland.ac.nz/t-lumley\n \n Google scholar\n Interests:\n \n bioinformaticsmedical statisticsdesign of medical trialsstatistical computingsurvey statistics"
},
{
- "objectID": "people/index.html#alumni",
- "href": "people/index.html#alumni",
- "title": "Meet the team",
- "section": "Alumni",
- "text": "Alumni\n \n \n \n \n \n \n \n \n \n \n \n \n \n Title\n \n \n Description\n \n \n \n \n \n \n \n \n \n Anastasios Panagiotelis\n \n \n Associate Professor\n \n \n \n \n \n \n \n Cameron Roach\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Dan Simpson\n \n \n Professor of Analytics Engagement\n \n \n \n \n \n \n \n Earo Wang\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Emi Tanaka\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n Fan Cheng\n \n \n Data Scientist\n \n \n \n \n \n \n \n Lauren Kennedy\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Luis Torres\n \n \n Postdoctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Mahdi Abolghasemi\n \n \n Post Doctoral Researcher (Data Science)\n \n \n \n \n \n \n \n Nathaniel Tomasetti\n \n \n Data Scientist\n \n \n \n \n \n \n \n Nicholas Spyrison\n \n \n PhD (Information Technology)\n \n \n \n \n \n \n \n Nick Tierney\n \n \n Research Software Engineer\n \n \n \n \n \n \n \n Pablo Montero Manso\n \n \n Post Doctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Patricia Menendez\n \n \n Senior Lecturer in Statistics\n \n \n \n \n \n \n \n Priyanga Dilini Talagala\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Puwasala Gamakumara\n \n \n Post Doctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Ryan Thompson\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Sayani Gupta\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Sevvandi Kandanaarachchi\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Sherry Zhang\n \n \n PhD (Statistics)\n \n \n \n \n \n \n \n Stephanie Kobakian\n \n \n Data Scientist\n \n \n \n \n \n \n \n Stuart Lee\n \n \n Postdoctoral Researcher (Statistics)\n \n \n \n \n \n \n \n Thiyanga Talagala\n \n \n Lecturer in Statistics\n \n \n \n \n \n \n \n Ursula Laa\n \n \n Lecturer (Statistics)\n \n \n \n \n \n No matching items"
+ "objectID": "people/current/palihawadana-nuwani/index.html",
+ "href": "people/current/palihawadana-nuwani/index.html",
+ "title": "Nuwani Palihawadana",
+ "section": "",
+ "text": "nuwani.kodikarapalihawadana@monash.edu\n @nuwani-palihawadana\n Interests:\n \n forecastingtime series analysispredictive modelling"
},
{
- "objectID": "people/current/fui-swen-kuh/index.html",
- "href": "people/current/fui-swen-kuh/index.html",
- "title": "Swen Kuh",
+ "objectID": "people/current/lydeamore-michael/index.html",
+ "href": "people/current/lydeamore-michael/index.html",
+ "title": "Michael Lydeamore",
"section": "",
- "text": "swen.kuh@monash.edu\n @swenkuh\n @swenk238\n Interests:\n \n Social scienceHierarchical modellingBayesian inference"
+ "text": "michael.lydeamore@monash.edu\n https://research.monash.edu/en/persons/michael-lydeamore\n @MikeLydeamore\n @MikeLydeamore\n Interests:\n \n computational statisticsinfectious diseases modelling and epidemiologydata science"
},
{
- "objectID": "people/current/hyndman-rob-j/index.html",
- "href": "people/current/hyndman-rob-j/index.html",
- "title": "Rob J Hyndman",
+ "objectID": "people/current/ohara-wild-mitch/index.html",
+ "href": "people/current/ohara-wild-mitch/index.html",
+ "title": "Mitch O’Hara-Wild",
"section": "",
- "text": "Rob.Hyndman@monash.edu\n https://robjhyndman.com/\n @robjhyndman\n @robjhyndman\n \n Google scholar\n Interests:\n \n forecastingtime seriesexploratory data analysisanomaly detection"
+ "text": "mail@mitchelloharawild.com\n https://www.mitchelloharawild.com/\n @mitchoharawild\n @mitchelloharawild\n Interests:\n \n statistical computingtime seriesforecastingsoftware design"
},
{
- "objectID": "people/current/vukcevic-damjan/index.html",
- "href": "people/current/vukcevic-damjan/index.html",
- "title": "Damjan Vukcevic",
+ "objectID": "people/current/jewson-jack/index.html",
+ "href": "people/current/jewson-jack/index.html",
+ "title": "Jack Jewson",
"section": "",
- "text": "damjan.vukcevic@monash.edu\n http://damjan.vukcevic.net/\n @VukcevicD\n @dvukcevic\n \n Google scholar\n \n ORCiD\n \n LinkedIn\n Interests:\n \n applied statisticsbayesian inferencedata scienceelection auditingsequential analysisstatistical genomics"
+ "text": "jack.jewson@monash.edu\n @jejewson\n Interests:\n \n bayesian inferencerobust inferencevariable selectionstructural learningdifferential privacy"
},
{
"objectID": "people/current/wang-xiaoqian/index.html",
diff --git a/people/current/jewson-jack/avatar.png b/people/current/jewson-jack/avatar.png
new file mode 100644
index 0000000..985afdf
Binary files /dev/null and b/people/current/jewson-jack/avatar.png differ
diff --git a/people/current/jewson-jack/index.qmd b/people/current/jewson-jack/index.qmd
new file mode 100644
index 0000000..5b12168
--- /dev/null
+++ b/people/current/jewson-jack/index.qmd
@@ -0,0 +1,16 @@
+---
+email: jack.jewson@monash.edu
+title: Jack Jewson
+description: Senior Lecturer in Statistics
+github: jejewson
+interests:
+- bayesian inference
+- robust inference
+- variable selection
+- structural learning
+- differential privacy
+user_groups:
+- Faculty
+- Current
+image: avatar.png
+---