- Overview of base R capabilities.
- If you are serious about character string manipulation, take a look at the
stringr
package which offers a more coherent interface. - Section largely based on:
Sept. 7-11, 2015
stringr
package which offers a more coherent interface.mean == "mean" ## Error in mean == "mean": comparison (1) is possible only for atomic and list types class(mean) ## [1] "function" class("mean") ## [1] "character" "The 'R' project for statistical computing" != 'The "R" project for statistical computing' ## [1] TRUE
a <- "" # empty string b <- character(0) # empty character vector
month.name[1:4] ## [1] "January" "February" "March" "April" length(month.name) ## [1] 12 nchar(month.name) ## [1] 7 8 5 5 3 4 4 6 9 7 8 8
chartr("u", "o", "union") # One characer is substituted ## [1] "onion" chartr(old = "ao", new = "A0", "This is a boring string") # Several characters can be is substituted ## [1] "This is A b0ring string" crazy = c("Here's to the crazy ones", "The misfits", "The rebels") # It is vectorized chartr("aei", "#!?", crazy) ## [1] "H!r!'s to th! cr#zy on!s" "Th! m?sf?ts" ## [3] "Th! r!b!ls" chartr("a-c", "D-F", letters[1:8]) # One can use ranges of characters ## [1] "D" "E" "F" "d" "e" "f" "g" "h"
This is nice, but regular expressions are far more powerful…
casefold("aLL ChaRacterS in LoweR caSe") ## [1] "all characters in lower case" casefold("All ChaRacterS in Upper Case", upper = TRUE) ## [1] "ALL CHARACTERS IN UPPER CASE"
Note that tolower()
and toupper()
do as their names imply.
Used to combine text and variable values while keeping some control over how the values are formatted.
paste("la", "maison", sep = " ") # basic concatenation, separator is a space ## [1] "la maison" # Vectorized concatenation. Note that numeric vectors are converted to character. paste("Section", 1:3, sep = "-") ## [1] "Section-1" "Section-2" "Section-3" paste("Section", 1:3, LETTERS[1:4], sep = "-") # Rules of recycling apply ## [1] "Section-1-A" "Section-2-B" "Section-3-C" "Section-1-D" paste("Section", 1:3, LETTERS[1:4], sep = "-", collapse = " & ") # Collapse everything to a single string ## [1] "Section-1-A & Section-2-B & Section-3-C & Section-1-D" paste("Section", rep(1:3, each = 5), 1:5, sep = "-") # What is this doing!? ## [1] "Section-1-1" "Section-1-2" "Section-1-3" "Section-1-4" "Section-1-5" ## [6] "Section-2-1" "Section-2-2" "Section-2-3" "Section-2-4" "Section-2-5" ## [11] "Section-3-1" "Section-3-2" "Section-3-3" "Section-3-4" "Section-3-5"
sprintf()
is extremely convenient but a bit complex too.sprintf(fmt = "The number pi = %f", pi) ## [1] "The number pi = 3.141593"
The argument fmt is a character vector mixing plain text with format specifications. Each specification starts with the % symbol and ends with a conversion letter; it marks the position at which the formatted value of the corresponding argument is inserted and the conversion applied to it.
sprintf("The number pi = %E", pi) # scientific notation ## [1] "The number pi = 3.141593E+00" sprintf("The number pi = %.2f %s", pi, c(". Yes ?", "or not?")) # specify precision. concatenate with text ## [1] "The number pi = 3.14 . Yes ?" "The number pi = 3.14 or not?" sprintf("This is %-8s justification of text", "LEFT") # Also fixing field width to 8 characters. ## [1] "This is LEFT justification of text" sprintf("This is %8s justification of text", "RIGHT") ## [1] "This is RIGHT justification of text"
diceResult <- 4 cat("The dice result is: ", diceResult, "!\nTry again.", sep = "") ## The dice result is: 4! ## Try again. cat("Long strings can", "be displayed over", "several lines using", "the fill= argument.", fill = 40) ## Long strings can be displayed over ## several lines using the fill= argument.
By default cat()
prints to stdout (the screen). cat()
returns no value (invisible NULL)
Use message()
, warning()
, stop()
as appropriate. These functions are part of the R condition system (?conditions
), a mechanism for signaling and handling anomalous or exceptional events occurring during the execution of your code.
howRU <- function(x) { message("Will first check the type of the argument.") if (!is.logical(x)) {stop("Argument must be either 'TRUE' or 'FALSE'.\n")} # asserting argument type if (x) cat("I am fine, thank you!\n") else warning("I don't feel so good.\n") # issue a warning if feel sick cat("Good bye") } howRU("bof") # Trows an error: execution alted ## Will first check the type of the argument. ## Error in howRU("bof"): Argument must be either 'TRUE' or 'FALSE'. howRU(TRUE) # prints something to stdout ## Will first check the type of the argument. ## I am fine, thank you! ## Good bye howRU(FALSE) # warns you that something is unusual ## Will first check the type of the argument. ## Warning in howRU(FALSE): I don't feel so good. ## Good bye
substring()
# extract based on a range of positions substring(text = "abcdef", first = 2, last = 4) ## [1] "bcd" # vectorized extraction substring("abcdef", first = 1:4, last = 4:5) # the 'x' and 'last' vectors are recycled ## [1] "abcd" "bcde" "cd" "de" # replacing portions of strings with the assignment operator x = c("may", "the", "force", "be", "with", "you") substring(x, 2, 2) <- "#" x ## [1] "m#y" "t#e" "f#rce" "b#" "w#th" "y#u" # everything is a vector... s <- c("more", "emotions", "are", "better", "than", "less") substring(s, 1:3, 2:4) <- c(" ", "zzz") s ## [1] " ore" "ezztions" "ar " "zztter" "t an" "lezz"
Can you insert a string longer than the range provided in the first
and last
arguments?
# What strings are email adresses? s <- c("object@attribute", "Rstudio", maintainer("base")) # The litteral approach grep(pattern = "@", x = s, value = TRUE) ## [1] "object@attribute" ## [2] "R Core Team <R-core@r-project.org>" # Accurate with a more specifc pattern myPat <- "([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})" grep(pattern = myPat, x = s, value = TRUE) ## [1] "R Core Team <R-core@r-project.org>"
s <- c("tobacco pipe (hazardous!)", "pile of junk", "directory of files") grepl("pi", s) ## [1] TRUE TRUE FALSE
grep("(", s, value = TRUE) # parentheses create capturing groups ## Error in grep("(", s, value = TRUE): invalid regular expression '(', reason 'Missing ')'' grep("\\(", s, value = TRUE) # escape special meanning with DOUBLE backslash ## [1] "tobacco pipe (hazardous!)"
regexpr()
, gregexpr()
or regexec()
.m <- regexpr(pattern = "pi[pl]e", text = s) # brakets define a character set m ## [1] 9 1 -1 ## attr(,"match.length") ## [1] 4 4 -1 ## attr(,"useBytes") ## [1] TRUE regmatches(x = s, m = m, invert = FALSE) ## [1] "pipe" "pile"
Note that, like the sub
functions below, regmatches()
can do substitution tasks via the assignment operator.
sub(pattern = "pi[pl]e", replacement = "cigarette", x = s) ## [1] "tobacco cigarette (hazardous!)" "cigarette of junk" ## [3] "directory of files" gsub("\\d", "_", "1789, the revolution") # global substitution of digits ## [1] "____, the revolution"
# Often used to get the words of a sentence s <- c("Killian! I'll be back!", "You cannot teach a man anything; you can only help him find it within himself – Galileo") words <- strsplit(x = s, split = " ") words # the returned object is a list of vectors ## [[1]] ## [1] "Killian!" "I'll" "be" "back!" ## ## [[2]] ## [1] "You" "cannot" "teach" "a" "man" ## [6] "anything;" "you" "can" "only" "help" ## [11] "him" "find" "it" "within" "himself" ## [16] "–" "Galileo" # It can be flattened unlist(words)[1:10] ## [1] "Killian!" "I'll" "be" "back!" "You" ## [6] "cannot" "teach" "a" "man" "anything;" # It can be used in a `apply` construct sapply(words, length) # number of words in the sentense ## [1] 4 17
strsplit()
with split = "" is used to break a string into a vector of single characters. Try it!
You have 12 samples and want to create dummy labels like "S_1". How would you do that with paste()
? Assign your resulting character vector to the name "lab1"
Can you do it with sprintf()
so that the numeric field is of width 2 with padding with zeros like "S_03"? Tip: look for "0" in the sprintf()
help page. Assign your resulting character vector to the name "lab2"
Sort lab1 and lab2. Which format is the most convenient?
cat(lab1 <- paste("S", 1:12, sep = "_")) ## S_1 S_2 S_3 S_4 S_5 S_6 S_7 S_8 S_9 S_10 S_11 S_12 cat(lab2 <- sprintf("S_%02i", 1:12)) ## S_01 S_02 S_03 S_04 S_05 S_06 S_07 S_08 S_09 S_10 S_11 S_12 sort(lab1) ## [1] "S_1" "S_10" "S_11" "S_12" "S_2" "S_3" "S_4" "S_5" "S_6" "S_7" ## [11] "S_8" "S_9" sort(lab2) ## [1] "S_01" "S_02" "S_03" "S_04" "S_05" "S_06" "S_07" "S_08" "S_09" "S_10" ## [11] "S_11" "S_12"
How would you find the position(s) at which the letter s occurs in saucisson sec? As usual with R, there are several ways to do that…
s <- "saucisson sec" ## 1 ## The brutal way {.build} # Break down string to a vector of single characters vectOfSingleChar <- substring(s, 1:nchar(s), 1:nchar(s)) # First solution vectOfSingleChar <- unlist(strsplit(s, split = "")) # Second, more 'elegant' solution # Find indexes of vector elements corresponding to "s" which(vectOfSingleChar == "s") ## [1] 1 6 7 11 ## 2 ## Using regular expressions {.build} gregexpr("s", s) ## [[1]] ## [1] 1 6 7 11 ## attr(,"match.length") ## [1] 1 1 1 1 ## attr(,"useBytes") ## [1] TRUE
Bottom line: learn regular expressions… Also, when you want to accomplish a task, think twice about the approach: some are easier to implement, more elegant and more efficient than others.
How would you capitalize the first letter of each word of a sentence? Create a function for that.
simpleCap <- function(s) { spls <- strsplit(s, " ")[[1]] paste(toupper(substring(spls, 1, 1)), substring(spls, 2), sep = "", collapse = " ") } simpleCap("This is a sentence.") ## [1] "This Is A Sentence."
You are running a full factorial experiment on subjects treated with a drug. Here are the factors involved and their levels:
dose <- seq(60, 80, 10) exposureTime <- c(100, 200) sex <- c("Male","Female")
You want to create treatment labels corresponding to combinations of the levels of the factors. How would you do that? Tip: use expand.grid()
mdf <- expand.grid(dose, exposureTime, sex) apply(X = mdf, MARGIN = 1, FUN = paste, collapse = "-") ## [1] "60-100-Male" "70-100-Male" "80-100-Male" "60-200-Male" ## [5] "70-200-Male" "80-200-Male" "60-100-Female" "70-100-Female" ## [9] "80-100-Female" "60-200-Female" "70-200-Female" "80-200-Female"
Read the file englishWords.txt in R. It contains a compendium of the words in the English language.
What is the longest word? What is the median word length in English? How many words end with "tion"?
f <- "/media/cunnac/DONNEES/CUNNAC/Lab-Related/Communications/Teaching/R_trainning_module/filesToBringToTrainning/englishWords.txt" words <- readLines(f) ## Warning in readLines(f): incomplete final line found on '/media/cunnac/ ## DONNEES/CUNNAC/Lab-Related/Communications/Teaching/R_trainning_module/ ## filesToBringToTrainning/englishWords.txt' wl <- nchar(words) max(wl) ## [1] 31 words[wl == max(wl)] # longest word ## [1] "dichlorodiphenyltrichloroethane" mwl <- median(wl) # median of word length mwl ## [1] 9 tionWords <- grep("tion$", words, value = TRUE) # words ending with "tion" tionWords[1:10] ## [1] "abacination" "abaction" "abalation" "abalienation" ## [5] "abannition" "abarticulation" "abbreviation" "abdication" ## [9] "abduction" "aberration" length(tionWords) ## [1] 7180
Generate 1E6 (one million) random five-letter words. Tip: use the function replicate()
. Do any of them turn out to be genuine English words? What are they? How would you run that a decent number of times to get a chance to have a match?
# First have a sense of the distribution of letters in "real" words: letterDistris <- lapply(strsplit(words, ""), FUN = function(x) { x <- factor(x, levels = letters) as.matrix(table(x)) } ) letterFreq <- prop.table(table(factor(unlist(strsplit(paste(words, collapse = ""), "")), levels = letters))) numbOfSampling <- 1E6 replicate(3, { randWords <- replicate(numbOfSampling, paste(sample(letters, mwl, replace = TRUE, prob = letterFreq), collapse = "") ) intersect(randWords, words) } )