################################################################################
#
#  Multi-dimensional visualizations in loon
#
################################################################################
#
#  Separate data sets will be used to illustrate different methods.
#  Methods are
#    1. Pairs plots
#    2. Serial axes plots
#    3. Navigation plots
#
################################################################################
#
#  Method 1:  Pairs plots or Scatterplot matrices
#             l_pairs()
#
################################################################################
#
library(loon)
#
# Here we will use the R data set "swiss" being
#
#   "Swiss Fertility and Socioeconomic Indicators (1888) Data"
#
# and consisting of
#
#   "Standardized fertility measure and socio-economic indicators
#    for each of 47 French-speaking provinces of Switzerland at about 1888."
#
# It has 6 socio-economic variables
names(swiss)
#  "Fertility         Ig, ‘common standardized fertility measure’
#   Agriculture       % of males involved in agriculture as occupation
#   Examination       % draftees receiving highest mark on army examination
#   Education         % education beyond primary school for draftees.
#   Catholic          % ‘catholic’ (as opposed to ‘protestant’).
#   Infant.Mortality  live births who live less than 1 year
#
#   All variables but ‘Fertility’ give proportions of the population."
#   Examination and Education are averages for 1887, 1888 and 1889.
#
# The (French speaking) province names are available as row.names(swiss).
#
# Scatter plot matrices in loon are created with the function l_pairs().
# The province names are used as item labels, and the plots join the
# linking group "Swiss".
pswiss <- l_pairs(
  swiss,
  title = "Swiss 1888 data",
  itemLabel = row.names(swiss),
  showHistograms = TRUE,
  showItemLabels = TRUE,
  linkingGroup = "Swiss"
)
#
# Note:
#   - No title appears.
#   - The inspector focuses on the plots individually.
#
# TRY
#
#   1. Identifying the province with highest education levels
#      and lowest fertility.
#
#   2.
#      Scrolling on the square identified as "Agriculture" versus "Education"
#      What happens?
#
#      Scroll while holding a modifier key.
#      Scroll while holding a second modifier key (or its Mac equivalent).
#      NOTE(review): the key names were lost from these two lines in the
#      source (possibly <Ctrl> and <Shift>) -- confirm against the loon
#      documentation on zooming.
#
#      Scale to plot on this plot.
#
#   3. On the "Education" versus "Catholic" scatterplot
#      Select the group in the bottom right corner
#      (high Catholic, low Education)
#      Rescale this plot to selected.
#
#      Rescale it to plot.
#      Deselect the points.
#
#   4. Click on any scatterplot.
#      In the INSPECTOR for that plot:
#        - which boxes may be checked or unchecked.
#        - Why would these constraints make sense?
#
#   5. Click on any histogram.
#      In the INSPECTOR for that histogram:
#        - which boxes may be checked or unchecked.
#        - Why would these constraints make sense?
#        - How can you move the bin handle?
#
#
# The data structure
#
str(pswiss)
class(pswiss)
# Individual plots may be accessed by name
names(pswiss)
# For example
class(pswiss$x3y1)     # Top row third column ...histogram
pswiss$x3y1["xlabel"]  # of Examination
# Et cetera
class(pswiss$x3y2)
class(pswiss$x6y2)
#
# The linking enforces the following: changing the colour and size on one
# plot of the matrix is reflected in the linked plots.
low_ed <- swiss$Education < 20
pswiss$x3y2["color"] <- low_ed
pswiss$x3y2["size"] <- 10
#
# The following is information on the layout.
l_getLocations(pswiss)
#
# And all plots can be accessed by this generic function
l_getPlots(pswiss)
#
# E.g. Halve all the binwidths of the histograms and rescale them.
# inherits() is the idiomatic S3 class test; scatterplots are skipped.
for (h in l_getPlots(pswiss)) {
  if (inherits(h, "l_hist")) {
    h["binwidth"] <- h["binwidth"] / 2
    l_scaleto_plot(h)
  }
}
#
#
# There appears to be a small bug in the following
# (which should NOT be changing the appearance of the l_pairs)
#
# NOTE(review): the source does not make clear whether this call was meant
# to run or had been commented out; it is kept live here, like every other
# statement that follows a bare "#" spacer line -- confirm.
plot(pswiss)
#
# See help(l_pairs) for more options.
#
#
################################################################################
#
#  Method 2:  Parallel coordinate and Radial axes plots
#             l_serialaxes()
#
################################################################################
#
library(loon)
#
# For simplicity, use the famous Iris data containing four measurements
# on 150 iris flowers, 50 from each of three species.
#
nFlowers <- nrow(iris)
nFlowers
nVars <- ncol(iris) - 1  # Not counting Species.
#
# Mix up the labels, so we have no clue of the species
# from the label (which would be row number - 1 otherwise)
#
# sample.int(n) draws a random permutation of 1..n, which is what
# sample(1:n, size = n, replace = FALSE) did.
randomLabels <- paste("iris:", sample.int(nFlowers))
#
# Each of the four variables is mapped to a single parallel axis
# (instead of orthogonal)
# and the axes are laid out in parallel.
#
# The minimum value of each variable (across all flowers/rows)
# is mapped to 0 and the maximum to 1.
# This is called "variable" scaling here and is the default.
sa_iris <- l_serialaxes(
  iris[, -5],  # drop the fifth column, Species
  axesLayout = "parallel",
  linewidth = 2,
  color = "slategrey",
  title = "Iris data",
  itemLabel = randomLabels,
  showItemLabels = TRUE,
  linkingGroup = "iris"
)
#
# Note:
#
#   - the inspector has changed
#
#   - Each flower/observation/row has become a "curve"
#
#   - Criss-cross lines between axes suggest negative correlation
#     between variables
#     (low values on one axis map to high values on the other)
#
#   - Parallel or converging lines (past one of the axes) suggest
#     positive correlation
#     (low values on one axis map to low values on the other)
#
#   - The tighter the criss-crossing (e.g. a single point of intersection)
#     the closer to negative one is the correlation.
#
#   - The more parallel (or consistently ordered) the greater the
#     positive correlation.
#
#   - Each axis ranges from 0 (at the bottom) to 1 (at the top)
#
#
# TRY:
#
#   1. Use item labels (hover over a line) to identify individuals
#
#   2. Select lines individually or several by sweeping a line over them.
#
#   3.
#      Identify three different groups by colour.
#        - Deactivate any observations you want to move "out of the way"
#
#   4. Play around with the "show" checkboxes in the inspector
#
#   5. Change the axes layout to radial
#
#   6. Change the scaling (maps values to 0-1 axes)
#
#        - variable:
#            minimum value of each variable/column is mapped to 0
#            maximum value of each variable/column is mapped to 1
#            across all rows
#
#        - observation:
#            minimum value of each observation/row is mapped to 0
#            maximum value of each observation/row is mapped to 1
#            across all columns
#
#        - data:
#            minimum value of all the data is mapped to 0
#            maximum value of all the data is mapped to 1
#            across all entries (every row and column)
#
#        - none:
#            no mapping is done
#            data values are plotted unchanged
#            data values may not be in the range of the axes
#            allows user to construct any mapping they want by first
#            transforming data
#
#
########
#
# NOTE: Variable order problem
#
#   - not every pair of variables appears beside one another
#   - this may affect interpretation
#   - an R package called "PairViz" has been built to address this problem
#     for any pairwise visualization.
#   - Here we use only one of the PairViz functions (eulerian()) to get
#     all pairs to appear in the plot.
#
library(PairViz)
# An ordering of the variable indices in which every pair is adjacent
# somewhere; indices repeat, so the same column appears on several axes.
ord <- eulerian(nVars)
ord
sa_allpairs <- l_serialaxes(
  iris[, ord],
  axesLayout = "parallel",
  linewidth = 2,
  color = "slategrey",
  title = "Iris data (all pairs)",
  itemLabel = randomLabels,
  showItemLabels = TRUE,
  linkingGroup = "iris"
)
########
#
# EXPLORE
#
#  Use either (or both) of the above displays to see if you
#  can identify three groups of flowers.  (i.e. visual clustering)
#
#  When you have identified these by assigning different colours
#  you can check how well you did as follows.
#
# Read the colours currently on the serial axes plot; each distinct
# colour becomes one cluster level.
cluster <- as.factor(sa_allpairs["color"])
levels(cluster) <- LETTERS[seq_along(levels(cluster))]
#
# Your clusters now have been assigned letters, which
# can be compared to the true species of each flower
#
Species <- iris$Species
#
# as follows.
classes <- data.frame(cluster = cluster, species = Species)
# Cross-tabulate cluster against species (the formula is spelled out
# rather than relying on the xtabs() default of "all columns").
xtabs(~ cluster + species, data = classes)
#
# If you want to see the true species coloured in the plots
sa_allpairs["color"] <- Species
#
# EXPLORE:
#  A more fun dataset to try your hand at would be "oliveAcids"
#  from the loon package.
data("oliveAcids", package = "loon")
#
#
################################################################################
#
#  Method 3:  Navigation plots
#             l_navgraph()
#
################################################################################
#
library(loon)
#
# Here, we will use a dynamic interactive graphic on a data set having
# eight dimensions.
#
# This is the oliveAcids data (the measurements only from the olive data)
# available in loon.
#
# See help("oliveAcids") or, better, help("olive") for a complete description.
#
#
data("oliveAcids", package = "loon")
data("olive", package = "loon")
#
#
#
olive_nav <- l_navgraph(oliveAcids, linkingGroup = "olive")
#
# Note:
#
#   - two windows appear:
#       one is a navigation graph window,
#       one is a scatterplot
#
#   - the navigation graph (navgraph) has
#       - pairs of variables as nodes
#       - edges connecting nodes
#       - a large orange circle called the "navigator"
#       - some display choices attached at the right
#
#
#   - the scatterplot shows
#       - the observations for a pair of variables
#       - the variables are identified with the
#         location of the navigator in the navgraph
#
#   - the navigation graph has its own inspector (one extra check box)
#
#   - the display choices change the navgraph
#       - 3d transition has edges between nodes
#         that share a variable
#         (edges involve three variables)
#
#       - 4d transition has edges between nodes
#         that do not share a variable
#         (edges involve four variables)
#
# TRY
#
#   1.
#      Click on the orange navigator in the navgraph
#
#   2. Hold down the shift key to see connected nodes
#
#   3. Holding the shift key select a connected node
#      Continue in this way until you have selected several
#
#   4. Scroll your mouse.
#
#   5. Drag the navigator
#
#   6. Switch between transition graph types and repeat.
#
# EXPLORE
#
#  Try to identify (by colouring) as many groups as you can in
#  this data set.  (i.e. visual clustering)
#
#  When you have identified all you want, you can compare these
#  to the growing areas of these olive oils.
#
#  You can extract the features of the scatterplot
#  as follows
#
olive_plot <- olive_nav$plot
#
# Get your clusters (identified with colours); each distinct colour
# becomes one cluster level, relabelled A, B, C, ...
cluster <- as.factor(olive_plot["color"])
levels(cluster) <- LETTERS[seq_along(levels(cluster))]
#
# The growing areas (or real clusters)
growing_areas <- olive$Area
#
# See how well you did.
classes <- data.frame(cluster = cluster, areas = growing_areas)
# Cross-tabulate cluster against growing area (the formula is spelled out
# rather than relying on the xtabs() default of "all columns").
xtabs(~ cluster + areas, data = classes)
#
# You might want to explore the data structure
# now knowing the actual growing areas:
olive_plot["color"] <- growing_areas
#
# Watch how the colours "travel" together.
#
# See the following for more examples
#
demo("l_ng_images_frey_LLE")
demo("l_ng_images_faces")
demo("l_ng_dimred")
#
# And much more on navigation graphs and related tools
# for exploring high dimensional data at
l_help("learn_R_display_graph.html")
#
#
################################################################################