Révision | 2fe3c54f9a1835f5709d960997254d6e668aa906 |
---|---|
Taille | 185,252 octets |
l'heure | 2025-01-09 18:33:11 |
Auteur | Lorenzo Isella |
Message de Log | I simply replaced magritte with the native pipe. |
## function to create a log binning
log_bin <- function(x, n_bin){
x_max <- max(x)
x_min <- min(x)
m <- n_bin-1
r <- (x_max/x_min)^(1/m)
my_seq <- seq(0,m,by=1)
grid <- x_min*r^my_seq
return(grid)
}
log_binning_3 <- function(x_min,x_max,epsi,n_bin){
#epsi <- 0.001
x_max <- x_max+epsi
m <- n_bin-1
r <- (x_max/x_min)^(1/m)
my_seq <- seq(0,m,by=1)
grid <- x_min*r^my_seq
}
log_binning <- function(x_min,x_max,n_bin){
x_max <- x_max
m <- n_bin-1
r <- (x_max/x_min)^(1/m)
my_seq <- seq(0,m,by=1)
grid <- x_min*r^my_seq
}
find_middle_point <- function(x, log_trans=F){
nn <- length(x)
if (log_trans==F){
x_ini <- x[1:nn-1]
x_fin <- x[2:nn]
x_middle <- (x_ini+x_fin)/2
}
else{
x <- log(x)
x_ini <- x[1:nn-1]
x_fin <- x[2:nn]
x_middle <- (x_ini+x_fin)/2
x_middle <- exp(x_middle)
}
return(x_middle)
}
## A function to pad columns of unequal length with NAs.
## pad <- function(x, n) { # ugly function that doesn't keep the attributes of x
## len.diff <- n - length(x)
## c(rep(NA, len.diff), x)
## }
pad <- function(x, n, pattern=NA, before=T) { # ugly function that doesn't keep the attributes of x
len.diff <- n - length(x)
if (before==T){
res <- c(rep(pattern, len.diff), x) }
else{
res <- c( x.rep(pattern, len.diff))
}
return(res)
}
pad2 <- function(x1, y1, pattern=NA, before=T) { # ugly function that doesn't keep the attributes of x
if (length(x1)<length(y1)){
x <- x1
y <- y1
} else {
x <- y1
y <- x1
}
n <- length(y)
len.diff <- n - length(x)
if (before==T){
res <- c(rep(pattern, len.diff), x) }
else{
res <- c( x,rep(pattern, len.diff))
}
return(res)
}
## a function to read all the csv files in a director and make
## them a tibble
read_all_csv <- function( mypath=".", ...){
f <- list.files(
mypath,
pattern = "*.csv",
full.names = TRUE)
d <- purrr::map_df(f, readr::read_csv, .id = "id", ...)
return(d)
}
read_all_excel <- function( mypath=".", pattern="*xlsx", ...){
f <- list.files(
mypath,
pattern,
full.names = TRUE)
d <- purrr::map_df(f, ## readxl::
read_excel, .id = "id", ...)
return(d)
}
read_all_excel_to_char <- function( mypath=".", pattern="*xlsx", ...){
f <- list.files(
mypath,
pattern,
full.names = TRUE)
d <- purrr::map_df(f, read_excel_to_char, .id = "id", ...)
return(d)
}
read_all_csv_to_char <- function( mypath=".", pattern="*csv", ...){
f <- list.files(
mypath,
pattern,
full.names = TRUE)
d <- purrr::map_df(f, read_csv_to_char, .id = "id", ...)
return(d)
}
## same but for the tsv files
read_all_tsv <- function( mypath=".", ...){
f <- list.files(
mypath,
pattern = "*.tsv",
full.names = TRUE)
d <- purrr::map_df(f, readr::read_tsv, .id = "id", ...)
return(d)
}
read_tsv_to_char <- function(x, ...){
res <- read_tsv(x, ...) |>
as_tibble() |>
all_to_char()
return(res)
}
read_all_tsv_to_char <- function( mypath=".", pattern="*tsv", ...){
f <- list.files(
mypath,
pattern,
full.names = TRUE)
d <- purrr::map_df(f, read_tsv_to_char, .id = "id", ...)
return(d)
}
## same but for any files
read_all_extensions <- function( extension, mypath=".", ...){
f <- list.files(
mypath,
pattern = paste("*", extension, sep=""),
full.names = TRUE)
d <- purrr::map_df(f, readr::read_delim, .id = "id", ...)
return(d)
}
## see https://kutt.it/gmtsnx for the compound annualized growth rate
cagr <- function(x_ini, x_fin, n){
res <- (x_fin/x_ini)^(1/n)-1
return(res)
}
## a function to calculate the composite return (or growth) rate
composite_rate <- function(myvec){
myvec <- myvec[!is.na(myvec)]
res <- prod(1+myvec)-1
n <- length(myvec)
res <- (1+res)^(1/n)-1
return(res)
}
# a function to retrieve estat data
estat_retrieval <- function(dataset_query){
res <- getTimeSeriesTable('EUROSTAT', dataset_query
) |> as_tibble()
return(res)
}
## see https://frama.link/9rtXmV-Y
##An NSE function to calculate the yearly growth rates
## calc_growth <- function(df, old_column, new_column){
## old_column = enquo(old_column)
## ## This is how you define/modify the name of the new column that
## ## mutate wll create
## new_column = as.character(substitute(new_column))
## df %>%
## mutate(use = !!old_column,
## !!new_column := c(NA, head(diff(c(use, NA))/use, -1)),
## use=NULL
## ) ## %>%
## ## mutate()
## }
#### The function above does not work any longer. I write below an updated version
calc_growth <- function(df, old_col, new_col) {
res <-df |>
mutate({{new_col}} := c(NA,(tail({{old_col}}, -1)
-head({{old_col}}, -1))/head({{old_col}}, -1))
)
return(res)
}
calc_returns <- function(x, log_ret=F) {
if (log_ret==F){
res <- c(NA,(tail(x, -1)
-head(x , -1))/head(x, -1))
} else {
res <- c(NA,log(tail(x, -1)/
head(x, -1)))
}
return(res)
}
## See https://purrple.cat/blog/2018/03/02/multiple-lags-with-tidy-evaluation/
mylags <- function(data, variable, n=10){
variable <- enquo(variable)
indices <- seq_len(n)
quosures <- map( indices, ~quo(lag(!!variable, !!.x)) ) |>
set_names(sprintf("lag_%s_%02d", quo_text(variable), indices))
mutate( data, !!!quosures )
}
linMap <- function(x, from, to)
(x - min(x)) / max(x - min(x)) * (to - from) + from
## function using NSE to clean the special characters out of a string column
clean_hex <- function(df, new_column, old_column){
old_column <- enquo(old_column)
new_column <- as.character(substitute(new_column))
df |> mutate(use = !!old_column,
!!new_column := gsub("[^[:alnum:]///' ]", "", use),
use=NULL
)
}
## remove_special_char <- function(x, new_pattern=""){
## ## remove special characters from a column
## a <- str_replace_all(x, "[[:punct:]]", new_pattern)
## res <- str_replace_all(a, "[^[:alnum:]]", new_pattern)
## res <- str_trim(res, side ="both")
## return(res)
## }
remove_special_char <- function(x, new_pattern=""){
## remove special characters from a column
res <- x |>
str_replace_all( "[[:punct:]]", new_pattern) |>
str_replace_all( "[^[:alnum:]]", new_pattern) |>
str_trim( side ="both")
return(res)
}
remove_special_char_num <- function(x, new_pattern=""){
## remove special characters from a column
res <- x |>
str_replace_all( "[[:punct:]]", new_pattern) |>
str_replace_all( "[^[:alnum:]]", new_pattern) |>
str_replace_all( "[:digit:]", new_pattern) |>
str_trim( side ="both")
return(res)
}
## See https://stackoverflow.com/questions/24173194/remove-parentheses-and-text-within-from-strings-in-r/24174655#24174655
remove_text_within_brackets <- function(x, new_pattern=""){
res <- str_replace_all(x, " \\s*\\([^\\)]+\\)", new_pattern)
return(res)
}
remove_all_special_char <- function(df, new_pattern="" ){
## res <- df |>
## mutate(across(where(is.character), ~remove_special_char(.x, new_pattern)))
res <- df |>
mutate(across(where(is.character), \(x) remove_special_char(x, new_pattern)))
return(res)
}
## see https://www.opencase.in/text-cleaning-in-r/
remove_miscoded_char <- function(x, new_pattern=""){
res <- gsub("[^\x01-\x7F]", new_pattern, x)
return(res)
}
## see https://frama.link/9rtXmV-Y
sort_keep <- function(df,old_column, new_column, levels_to_keep){
old_column = enquo(old_column)
## This is how you define/modify the name of the new column that
## mutate wll create
new_column = as.character(substitute(new_column))
df |>
arrange(desc(!!old_column)) |>
mutate(use = !!old_column,
!!new_column := factor(c(use[1:levels_to_keep],
rep("Other", n() - levels_to_keep)),
levels = c(use[1:levels_to_keep], "Other")) ,
use=NULL
)
## In the mutate statement, first you define a copy of the old_column, which you call "use". You do all the work on the new_column by modifying "use". Finally ("use=NULL") you remove the "use" column as you do not want to see it any longer in the results
}
## see http://r.789695.n4.nabble.com/scales-percent-precision-td4685948.html
#A function to add percentages to ggplot2 axis without any side effects
mypercent<-function(theargument, siglevel=2) {
stopifnot(is.numeric(theargument))
paste(signif(theargument, siglevel) * 100, "%", sep="")
}
mypercent_format<-function(theargument, n, my_sep="") {
stopifnot(is.numeric(theargument))
x <- format(round(theargument*100, n), nsmall = n, big.mark= my_sep, trim=TRUE )
res <- paste(x, "%", sep="")
return(res)
}
mypercentlatex_format<-function(theargument, n, my_sep="") {
stopifnot(is.numeric(theargument))
x <- format(round(theargument*100, n), nsmall = n, big.mark= my_sep, trim=TRUE )
res <- paste(x, "\\%", sep="")
return(res)
}
## to use proper labels in latex
mypercentlatex<-function(theargument, siglevel=2) {
stopifnot(is.numeric(theargument))
paste(signif(theargument, siglevel) * 100, "\\%", sep="")
}
mypercentlatex2<-function(theargument, roundlevel=2) {
stopifnot(is.numeric(theargument))
paste(round(theargument, roundlevel) * 100, "\\%", sep="")
}
## Theme for maps
theme.map <- ggplot2::theme(
text = ggplot2::element_text(color = '#444444')
,panel.background = ggplot2::element_rect(fill = '#CCCCCC')
,plot.background = ggplot2::element_rect(fill = '#CCCCCC')
,legend.background = ggplot2::element_rect(fill ='#CCCCCC')
,panel.grid = ggplot2::element_blank()
,plot.title = ggplot2::element_text(size = 18, face='bold')
,plot.subtitle = ggplot2::element_text(size = 12)
,legend.key = ggplot2::element_blank()
,axis.text = ggplot2::element_blank()
,axis.ticks = ggplot2::element_blank()
,axis.title = ggplot2::element_blank()
)
swap <- function(x,i,j) {x[c(i,j)] <- x[c(j,i)]; x}
swap2 <- function(vec, from, to) {
tmp <- to[ match(vec, from) ]
tmp[is.na(tmp)] <- vec[is.na(tmp)]
return(tmp)
}
shift <- function(x, i,j){
x1 <- x[-i]
x2 <- x1[1:(j-1)]
x3 <- c(x2,x[i])
x4 <- x[(j+1):length(x)]
res <- c(x3,x4)
return(res)
}
## see https://github.com/mrdwab/SOfun/blob/master/R/moveMe.R
## and https://frama.link/8Pq0V61u .
## moveMe <- function(invec, movecommand) {
## movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\\s+"),
## function(x) x[x != ""])
## movelist <- lapply(movecommand, function(x) {
## Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
## ToMove <- setdiff(x, Where)
## list(ToMove, Where)
## })
## myVec <- invec
## for (i in seq_along(movelist)) {
## temp <- setdiff(myVec, movelist[[i]][[1]])
## A <- movelist[[i]][[2]][1]
## if (A %in% c("before", "after")) {
## ba <- movelist[[i]][[2]][2]
## if (A == "before") {
## after <- match(ba, temp)-1
## } else if (A == "after") {
## after <- match(ba, temp)
## }
## } else if (A == "first") {
## after <- 0
## } else if (A == "last") {
## after <- length(myVec)
## }
## myVec <- append(temp, values = movelist[[i]][[1]], after = after)
## }
## myVec
## }
round_digit_column <- function(mycol, n){
res <- format(round(mycol, digits=n), nsmall = n)
}
round_preserve_sum <- function(x, digits = 0) {
up <- 10^digits
x <- x * up
y <- floor(x)
indices <- tail(order(x-y), round(sum(x, na.rm=T)) - sum(y, na.rm=T))
y[indices] <- y[indices] + 1
y / up
}
## see http://bit.ly/2yFBRHB.
sum_n_consecutive_entries <- function(v,n){
unname(tapply(v, (seq_along(v)-1) %/% n, sum))
}
# see http://bit.ly/1zbvAti
integer_breaks <- function(n = 5, ...) {
breaker <- pretty_breaks(n, ...)
function(x) {
breaks <- breaker(x)
breaks[breaks == floor(breaks)]
}
}
replace_country_code <- function(mylist){
eu_list <- c("BE", "BG","CZ", "DK", "DE", "EE", "IE", "EL",
"ES", "FR", "IT", "CY", "LV", "LT", "LU", "HU", "MT",
"NL", "AT","PL", "PT", "RO", "SI", "SK", "FI","SE","UK", "HR" )
eu_list <- sort(eu_list)
oecd_list <- c("AUT", "BEL", "BGR", "CYP", "CZE","DEU", "DNK", "EST", "GRC",
"ESP", "FIN", "FRA", "HRV", "HUN", "IRL", "ITA", "LTU",
"LUX", "LVA", "MLT", "NLD", "POL", "PRT", "ROU", "SWE",
"SVN", "SVK", "GBR"
)
for (i in seq(length(oecd_list))){
country <- oecd_list[i]
sel <- which(mylist==country)
mylist[sel] <- eu_list[i]
}
return(mylist)
}
replace_country_code_extended <- function(mylist, eu_list, eu_list_extended){
for (i in seq(length(eu_list_extended))){
country <- eu_list_extended[i]
sel <- which(mylist==country)
mylist[sel] <- eu_list[i]
}
return(mylist)
}
eu_to_iso3 <- function(mylist){
mylist <- as.character(mylist)
eu_list <- c("BE", "BG","CZ", "DK", "DE", "EE", "IE", "EL",
"ES", "FR", "IT", "CY", "LV", "LT", "LU", "HU", "MT",
"NL", "AT","PL", "PT", "RO", "SI", "SK", "FI","SE","UK", "HR" )
eu_list <- sort(eu_list)
oecd_list <- c("AUT", "BEL", "BGR", "CYP", "CZE","DEU", "DNK", "EST", "GRC",
"ESP", "FIN", "FRA", "HRV", "HUN", "IRL", "ITA", "LTU",
"LUX", "LVA", "MLT", "NLD", "POL", "PRT", "ROU", "SWE",
"SVN", "SVK", "GBR"
)
for (i in seq(length(eu_list))){
country <- eu_list[i]
sel <- which(mylist==country)
mylist[sel] <- oecd_list[i]
}
return(mylist)
}
## see http://bit.ly/1XFdh1h
facetAdjust <- function(x, pos = c("up", "down"))
{
pos <- match.arg(pos)
p <- ggplot_build(x)
gtable <- ggplot_gtable(p); dev.off()
dims <- apply(p$panel$layout[2:3], 2, max)
nrow <- dims[1]
ncol <- dims[2]
panels <- sum(grepl("panel", names(gtable$grobs)))
space <- ncol * nrow
n <- space - panels
if(panels != space){
idx <- (space - ncol - n + 1):(space - ncol)
gtable$grobs[paste0("axis_b",idx)] <- list(gtable$grobs[[paste0("axis_b",panels)]])
if(pos == "down"){
rows <- grep(paste0("axis_b\\-[", idx[1], "-", idx[n], "]"),
gtable$layout$name)
lastAxis <- grep(paste0("axis_b\\-", panels), gtable$layout$name)
gtable$layout[rows, c("t","b")] <- gtable$layout[lastAxis, c("t")]
}
}
class(gtable) <- c("facetAdjust", "gtable", "ggplot"); gtable
}
print.facetAdjust <- function(x, newpage = is.null(vp), vp = NULL) {
if(newpage)
grid.newpage()
if(is.null(vp)){
grid.draw(x)
} else {
if (is.character(vp))
seekViewport(vp)
else pushViewport(vp)
grid.draw(x)
upViewport()
}
invisible(x)
}
## See http://bit.ly/1HHSEKL (post by pbible)
cosineDist <- function(x){
as.dist(1 - x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
## See http://bit.ly/1YKuKT4
cos.sim <- function(ix)
{
A = X[ix[1],]
B = X[ix[2],]
return( sum(A*B)/sqrt(sum(A^2)*sum(B^2)) )
}
cos_sim <- function(x,y){
num <- sum(x*y)
denom <- sum(x*x)*sum(y*y)
res <- num/denom
return(res)
}
cos_sim_mat <- function(x,y){
res <- x %*% y / sqrt(x%*%x * y%*%y)
return(res)
}
plot_bunch <- function(data_plot, country_sel, myxlab, myylab, fname){
## country_sel <- c("NL", "UK")
data_plot <- data_plot[data_plot$country %in% country_sel, ]
lbls <- levels(data_plot$country)
col_seq <- seq(length(lbls))
#col_seq[5] <- "brown"
title_exp <- paste("")
gpl <- ggplot(data_plot, aes(x=year, y=value ,
colour=country,
shape=country
)) +
my_ggplot_theme(c(0.13, 0.8))+
theme(legend.position = 'right')+
geom_point(size=3) +
geom_point(size=2.9) +
geom_point(size=2.8) +
geom_point(size=3.1) +
geom_point(size=3.2) +
geom_point(size=3.3) +
geom_line(size=0.5)+
## scale_colour_manual("Countries", breaks=lbls,
## values=col_seq) +
scale_color_solarized(accent = "blue", "Countries")+
scale_shape_manual("Countries", breaks=lbls, values=col_seq
) +
#scale_y_continuous(limits=c(0.3,0.9),breaks=seq(0.3, 0.9, by=0.3))+
scale_y_continuous(breaks=seq(0,3,by=0.2), limits=c(0,3.1))+
## ylim(-0.1,0.1)+
scale_x_continuous(breaks=seq(2000, 2020, by=1))+
labs(title=title_exp)+
theme(plot.title = element_text(lineheight=.8, size=24, face="bold", vjust=1))+
theme(legend.text = element_text(vjust=1,lineheight=1 ))+
theme(legend.title = element_text(vjust=1,lineheight=1, size=12 ))+
xlab(myxlab)+
ylab(myylab)
country_sel <- paste(country_sel,collapse = "_")
fname1 <- paste(fname,country_sel,".pdf", sep="")
ggsave(fname1, gpl, width=10,height=5)
fname2 <- paste(fname,country_sel,".png", sep="")
ggsave(fname2, gpl, width=10,height=5)
}
plot_bunch2 <- function(data_plot, country_sel, myxlab, myylab,xgrid, ygrid,
fname){
## country_sel <- c("NL", "UK")
data_plot <- data_plot[data_plot$country %in% country_sel, ]
lbls <- levels(data_plot$country)
col_seq <- seq(length(lbls))
#col_seq[5] <- "brown"
title_exp <- paste("")
gpl <- ggplot(data_plot, aes(x=year, y=value ,
colour=country,
shape=country
)) +
my_ggplot_theme(c(0.13, 0.8))+
theme(legend.position = 'right')+
geom_point(size=3) +
geom_point(size=2.9) +
geom_point(size=2.8) +
geom_point(size=3.1) +
geom_point(size=3.2) +
geom_point(size=3.3) +
geom_line(size=0.5)+
## scale_colour_manual("Countries", breaks=lbls,
## values=col_seq) +
scale_color_solarized(accent = "blue", "Countries")+
scale_shape_manual("Countries", breaks=lbls, values=col_seq
) +
#scale_y_continuous(limits=c(0.3,0.9),breaks=seq(0.3, 0.9, by=0.3))+
scale_y_continuous(breaks=ygrid, limits=range(ygrid))+
## ylim(-0.1,0.1)+
scale_x_continuous(breaks=xgrid)+
labs(title=title_exp)+
theme(plot.title = element_text(lineheight=.8, size=24, face="bold", vjust=1))+
theme(legend.text = element_text(vjust=1,lineheight=1 ))+
theme(legend.title = element_text(vjust=1,lineheight=1, size=12 ))+
xlab(myxlab)+
ylab(myylab)
country_sel <- paste(country_sel,collapse = "_")
fname1 <- paste(fname,country_sel,".pdf", sep="")
ggsave(fname1, gpl, width=10,height=5)
fname2 <- paste(fname,country_sel,".png", sep="")
ggsave(fname2, gpl, width=10,height=5)
}
find_in_string_vec <- function(str1, str2){
pos_vec <- c()
for ( i in seq(length(str1))){
pos <- grepl(str1[i], str2)
pos <- which(pos==T)
pos_vec <- c(pos_vec, pos)
}
pos_vec <- unique(pos_vec)
return(pos_vec)
}
select_nace <- function(data_exp, thresh, fname){
cum_exp <- cumsum(data_exp$sum)/sum(data_exp$sum)
data <- cbind(data_exp$code, data_exp$sum,data_exp$sum/sum(data_exp$sum)*100, cum_exp*100)
data <- as.data.frame(data)
names(data) <- c("Sector", "Expenditure (MIO NAC)", "% Total Expenditure", "Cumulative % total expenditure" )
write.table(data, fname, col.names=TRUE, row.names=FALSE, sep="," )
myfind <- c(which(cum_exp<thresh),which(cum_exp>=thresh)[1])
sel_sector <- data_exp$code[myfind]
return(sel_sector)
}
sort_expenditure <- function(data1){
seq_indic <- unique(data1$indicator)
sel <- which(seq_indic=="TOTAL")
if (length(sel)>0){
seq_indic <- seq_indic[-sel]
}
mysum <- c()
for (indic in seq_indic){
sel <- which(data1$indicator==indic)
temp <- sum(data1$value[sel])
mysum <- c(mysum,temp)
}
data_out <- as.data.frame(cbind(mysum,seq_indic))
names(data_out) <- c("sum", "code")
data_out$sum <- as.numeric(as.character(data_out$sum))
data_out$code <- as.character(data_out$code)
mysort <- sort(data_out$sum, decreasing =T, index.return=T)
data_out <- data_out[mysort$ix, ]
return(data_out)
}
extract_many_countries <- function(data, clist){
data_out <- data[(data$country %in% clist), ]
return(data_out)
}
replace_country_code <- function(mylist){
eu_list <- c("BE", "BG","CZ", "DK", "DE", "EE", "IE", "EL",
"ES", "FR", "IT", "CY", "LV", "LT", "LU", "HU", "MT",
"NL", "AT","PL", "PT", "RO", "SI", "SK", "FI","SE","UK", "HR" )
eu_list <- sort(eu_list)
oecd_list <- c("AUT", "BEL", "BGR", "CYP", "CZE","DEU", "DNK", "EST", "GRC",
"ESP", "FIN", "FRA", "HRV", "HUN", "IRL", "ITA", "LTU",
"LUX", "LVA", "MLT", "NLD", "POL", "PRT", "ROU", "SWE",
"SVN", "SVK", "GBR"
)
for (i in seq(length(oecd_list))){
country <- oecd_list[i]
sel <- which(mylist==country)
mylist[sel] <- eu_list[i]
}
return(mylist)
}
# see http://bit.ly/1zbvAti
integer_breaks <- function(n = 5, ...) {
breaker <- pretty_breaks(n, ...)
function(x) {
breaks <- breaker(x)
breaks[breaks == floor(breaks)]
}
}
fmt <- function(){
function(x) {
d <- log10(min(diff(x),na.rm=TRUE))
if(d < 0) format(x,nsmall = abs(round(d)),scientific = FALSE) else x
}
}
fmt2 <- function(n){
function(x) format(x,nsmall = n,scientific = FALSE)
}
drop_last_char <- function(t,n){
substr(t, 1, nchar(t)-n)
}
drop_ini_char <- function(t,n){
substr(t, (n+1), nchar(t))
}
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))
}
substrLeft <- function(x, n){
substr(x,1, n )
}
## see https://stackoverflow.com/questions/21878974/auto-wrapping-of-labels-via-labeller-label-wrap-in-ggplot2 https://kutt.it/97KqYV
##Function for plotting
wrapper1 <- function(width) {
function(x) {
lapply(strwrap(x, width = width, simplify = FALSE), paste, collapse="\n")
}
}
##Function to be used independently
wrapper2 <- function(x, width) {
lapply(strwrap(x, width = width, simplify = FALSE), paste, collapse="\n")
}
## See https://stackoverflow.com/questions/64333902/kableextra-after-updating-to-r-4-0-3-and-reinstalling-i-cannot-run-correctly-o
wrapper_text <- function(x, width) {
res <- wrapper2(x, width) |> unlist() |>
linebreak(align="l")
return(res)
}
insert_new_line <- function(x, width){
res <- wrapper2(x, width) |>
unlist()
return(res)
}
compact_list <- function(data_rd, missing=0){
myseq <- seq(length(data_rd))
data2 <- c()
for (i in myseq){
temp <- as.data.frame(data_rd[[i]])
#remove the missing data
if (missing==1){
sel <- complete.cases(temp)
temp_year <- as.numeric(row.names(temp)[sel])
} else{
temp_year <- as.numeric(row.names(temp))
}
namevec <-rep(names(data_rd[i]), length(temp_year))
## temp <- temp[sel, ]
temp2 <- as.data.frame(cbind(temp, temp_year, namevec))
data2 <- rbind(data2, temp2)
}
#data2 <- as.data.frame(data2)
names(data2) <- letters[seq(ncol(data2))]
data2$a <- as.numeric(data2$a)
#print("ncol(data2) is, ")
#print(ncol(data2))
return(data2)
}
#function to save a xtable to a standalone latex file.
wrapLatex <- function(dat, sb=0.7,fil, ...) {
cat(c("\\documentclass{article}\n",
"\\usepackage{graphics}\n",
"\\pagestyle{empty}\n",
"\\begin{document}\n"), file=fil)
print(dat,scalebox = sb, file=fil, append=TRUE, ...)
cat("\\end{document}\n", file=fil, append=TRUE) }
zoo.to.frame <- function(z2, name1="date", name2="value"){
out <- data.frame(Date=time(z2), z2, check.names=FALSE, row.names=NULL)
names(out) <- c(name1, name2)
out[[2]] <- as.numeric(out[[2]])
out <- out[complete.cases(out), ]
return(out)
}
## see http://bit.ly/2in5pSp
fill_forward_mean <- function(x){
ifelse(is.na(x) | is.na(c(0, head(x,-1))),
with(rle(na.locf(x, fromLast=T)), rep(values/lengths, lengths)),
x)
}
fill_forward <- function(x){
ifelse(is.na(x) | is.na(c(0, head(x,-1))),
with(rle(na.locf(x, fromLast=T)), rep(values, lengths)),
x)
}
## a function to separate the items of a *horizontal* legend!
ggplot_padded <- function(x, w = 1){ # w=1 adds 1cm to horizontal space
require(gtable)
if (!gtable::is.gtable(x))
x <- ggplotGrob(x)
get_legend <- function(x){
leg <- which(sapply(x$grobs, function(x) x$name) == "guide-box")
x$grobs[[leg]]
}
g <- get_legend(x)
# CASE-SPECIFIC: WITH 2 LEGEND ITEMS, POSITION TO BE ADJUSTED is 6
# TWEAK THIS ON A CASE-BY-CASE... e.g. ADD LINES BELOW AS NEEDED
# SIMILARLY FOR heights...
g$grobs[[1]]$widths[6] <- g$grobs[[1]]$widths[6] + unit(w,"cm")
x$grobs[x$layout$name == "guide-box"][[1]] <- g
require(grid)
grid.draw(x)
}
my_ggplot_theme <- function(legend_coord){ theme( panel.background = element_rect(fill="gray",
colour = "black", size = 0.5, linetype = 1),
panel.grid.minor = element_blank(),
panel.grid.major= element_line(colour = "white") ,
axis.ticks = element_line(colour = "black", size=1),
axis.ticks.length = unit(0.15, "cm"),
strip.background = element_rect(colour = 'blue',
fill = 'white', size = 1, linetype=1),
strip.text.x = element_text(colour = 'red', angle = 0,
linewidth = 12, hjust = 0.5,
vjust = 0.5, face = 'bold'),
strip.text.y = element_text(colour = 'red', angle = 0,
size = 12, hjust = 0.5,
vjust = 0.5, face = 'bold'),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20, angle=90, vjust=1),
axis.text.x = element_text(size=15, colour="black", vjust=1),
axis.text.y = element_text(size=15, colour="black", hjust=1),
legend.text = element_text(size = 14, vjust=0.4),
legend.title = element_text(size = 24 , hjust=0
),
plot.title = element_text(hjust = 0.5),
legend.position = legend_coord,
## legend.title = element_blank(),
legend.background=element_rect(color=NA, fill=NA),
legend.key = element_rect(colour = NA, fill=NA) )
}
## my_ggplot_theme2 <- function(legend_coord){
## theme_bw()+
## theme(## legend.title = element_text(vjust=1,lineheight=1, size=14 ),
## legend.title = element_text(vjust = 1,lineheight=1,
## size=14, margin = margin(t = 4.5)),
## legend.spacing.y = grid::unit(3, "pt"),
## ## legend.text.align = 0.5,
## ## legend.text = element_text(hjust=0.5),
## panel.grid.minor = element_blank(),
## plot.title = element_text(lineheight=.8, size=24, face="bold",
## vjust=1),
## legend.text = element_text(vjust=.4, hjust=0.5, lineheight=1,size = 14 ),
## axis.title.x = element_text(size = 20, vjust=1),
## axis.title.y = element_text(size = 20, angle=90, vjust=1),
## axis.text.x = element_text(size=15, colour="black", vjust=1),
## axis.text.y = element_text(size=15, colour="black", hjust=1),
## legend.position.inside = legend_coord,
## ## legend.position = legend_coord,
## strip.background = element_rect(colour = 'blue',
## fill = 'white', linewidth = 1, linetype=1),
## strip.text.x = element_text(colour = 'red', angle = 0,
## size = 12, hjust = 0.5,
## vjust = 0.5, face = 'bold'),
## strip.text.y = element_text(colour = 'red', angle = 0,
## size = 12, hjust = 0.5,
## vjust = 0.5, face = 'bold'),
## )
## }
## chatgpt revision
my_ggplot_theme2 <- function(legend_coord) {
theme_bw() +
theme(
# Legend settings
legend.title = element_text(
vjust = 1,
lineheight = 1,
size = 14,
margin = margin(t = 4.5)
),
legend.text = element_text(
vjust = 0.4,
hjust = 0.5,
lineheight = 1,
size = 14
),
legend.spacing.y = grid::unit(3, "pt"),
# Conditionally handle legend position
legend.position = if (is.character(legend_coord)) {
legend_coord
} else if (is.numeric(legend_coord) && length(legend_coord) == 2) {
"none" # Temporarily set a default
} else {
stop("`legend_coord` must be a character or a numeric vector of length 2.")
},
legend.position.inside = if (is.numeric(legend_coord) && length(legend_coord) == 2) {
legend_coord
} else {
NULL
},
# Plot title
plot.title = element_text(
lineheight = 0.8,
size = 24,
face = "bold",
vjust = 1
),
# Axis titles
axis.title.x = element_text(size = 20, vjust = 1),
axis.title.y = element_text(size = 20, angle = 90, vjust = 1),
# Axis text
axis.text.x = element_text(size = 15, colour = "black", vjust = 1),
axis.text.y = element_text(size = 15, colour = "black", hjust = 1),
# Strip settings
strip.background = element_rect(
colour = 'blue',
fill = 'white',
linewidth = 1,
linetype = 1
),
strip.text.x = element_text(
colour = 'red',
size = 12,
hjust = 0.5,
vjust = 0.5,
face = 'bold'
),
strip.text.y = element_text(
colour = 'red',
size = 12,
hjust = 0.5,
vjust = 0.5,
face = 'bold'
),
# Grid lines
panel.grid.minor = element_blank()
)
}
my_ggplot_theme_new <- function(){
theme_bw()+
theme(## legend.title = element_text(vjust=1,lineheight=1, size=14 ),
legend.title = element_text(vjust = 1,lineheight=1,
size=14, margin = margin(t = 4.5)),
legend.spacing.y = grid::unit(3, "pt"),
## legend.text.align = 0.5,
## legend.text = element_text(hjust=0.5),
panel.grid.minor = element_blank(),
plot.title = element_text(lineheight=.8, size=24, face="bold",
vjust=1),
legend.text = element_text(vjust=.4, hjust=0.5, lineheight=1,size = 14 ),
axis.title.x = element_text(size = 20, vjust=1),
axis.title.y = element_text(size = 20, angle=90, vjust=1),
axis.text.x = element_text(size=15, colour="black", vjust=1),
axis.text.y = element_text(size=15, colour="black", hjust=1),
## legend.position.inside = legend_coord,
## legend.position = legend_coord,
strip.background = element_rect(colour = 'blue',
fill = 'white', linewidth = 1, linetype=1),
strip.text.x = element_text(colour = 'red', angle = 0,
size = 12, hjust = 0.5,
vjust = 0.5, face = 'bold'),
strip.text.y = element_text(colour = 'red', angle = 0,
size = 12, hjust = 0.5,
vjust = 0.5, face = 'bold'),
)
}
my_ggplot_theme3 <- function(legend_coord){
theme_gdocs()+
theme(legend.title = element_text(vjust=1,lineheight=1, size=14 ),
plot.title = element_text(lineheight=.8, size=24, face="bold", vjust=1),legend.text = element_text(vjust=.4,lineheight=1,size = 14 ),
axis.title.x = element_text(size = 20, vjust=1),
axis.title.y = element_text(size = 20, angle=90, vjust=1),
axis.text.x = element_text(size=15, colour="black", vjust=1),
axis.text.y = element_text(size=15, colour="black", hjust=1),
legend.position = legend_coord,
strip.background = element_rect(colour = 'blue',
fill = 'white', linewidth = 1, linetype=1),
strip.text.x = element_text(colour = 'red', angle = 0,
size = 12, hjust = 0.5,
vjust = 0.5, face = 'bold'),
strip.text.y = element_text(colour = 'red', angle = 0,
size = 12, hjust = 0.5,
vjust = 0.5, face = 'bold'),
)
}
my_ggplot_theme4 <- ggplot2::theme(panel.border =ggplot2:: element_rect(fill = NA,
colour = "grey10"),
panel.background = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_line(colour = "grey85"),
panel.grid.major = ggplot2::element_line(colour = "grey85"),
panel.grid.major.x = ggplot2::element_line(colour = "grey85"),
axis.text = ggplot2::element_text(size = 13, face = "bold"),
axis.title = ggplot2::element_text(size = 15, face = "bold"),
plot.title = ggplot2::element_text(size = 16, face = "bold"),
strip.text = ggplot2::element_text(size = 16, face = "bold"),
strip.background = ggplot2::element_rect(colour = "black"),
legend.text = ggplot2::element_text(size = 15),
legend.title = ggplot2::element_text(size = 16, face = "bold"),
legend.background = ggplot2::element_rect(fill = "white"),
legend.key = ggplot2::element_rect(fill = "white"))
reshape_series <- function(data, year_cut){
data <- compact_list(data)
data$country <- substrRight(as.character(data$c),2)
names(data) <- c("value", "year", "indicator", "country")
data$indicator <- as.character(data$indicator)
data$value <- as.numeric(data$value)
data <- data[complete.cases(data), ]
sel <- which(data$year>year_cut)
data <- data[sel, ]
return(data)
}
reshape_series_nocut <- function(data){
data <- compact_list(data)
data$country <- substrRight(as.character(data$c),2)
names(data) <- c("value", "year", "indicator", "country")
data$indicator <- as.character(data$indicator)
data$value <- as.numeric(data$value)
data <- data[complete.cases(data), ]
## sel <- which(data$year>year_cut)
## data <- data[sel, ]
return(data)
}
retrieve_ts <- function(sdmx_data){
ts1 <- as.ts(sdmx_data[[1]])
ts1 <- na.omit(ts1)
return(ts1)
}
intersect_ts <- function(ts_list){
ini <- ts_list[[1]]
for (i in seq(length(ts_list)-1)){
ini <- ts.intersect(ini, ts_list[[i+1]])
}
dimnames(ini)[[2]] <- LETTERS[1:length(dimnames(ini)[[2]])]
return(ini)
}
mysarima <-
function(xdata,p,d,q,P=0,D=0,Q=0,S=-1,details=TRUE,xreg=NULL,tol=sqrt(.Machine$double.eps),no.constant=FALSE, filename)
{
trc = ifelse(details==TRUE, 1, 0)
n = length(xdata)
if (is.null(xreg)) {
constant = 1:n
xmean = rep(1,n); if(no.constant==TRUE) xmean=NULL
if (d==0 & D==0) {
fitit = stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"),
xreg=xmean,include.mean=FALSE, optim.control=list(trace=trc,REPORT=1,reltol=tol))
} else if (xor(d==1, D==1) & no.constant==FALSE) {
fitit = stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"),
xreg=constant,optim.control=list(trace=trc,REPORT=1,reltol=tol))
} else fitit = stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"),
include.mean=!no.constant, optim.control=list(trace=trc,REPORT=1,reltol=tol))
}
#
if (!is.null(xreg)) {fitit = stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"), xreg=xreg, optim.control=list(trace=trc,REPORT=1,reltol=tol))
}
pdf(filename)
#
# replace tsdiag with a better version
old.par <- par(no.readonly = TRUE)
layout(matrix(c(1,2,4, 1,3,4), ncol=2))
rs <- fitit$residuals
stdres <- rs/sqrt(fitit$sigma2)
num <- sum(!is.na(rs))
plot.ts(stdres, main = "Standardized Residuals", ylab = "")
alag <- 10+sqrt(num)
ACF = stats::acf(rs, alag, plot=FALSE, na.action = na.pass)$acf[-1]
LAG = 1:alag/frequency(xdata)
L=2/sqrt(num)
plot(LAG, ACF, type="h", ylim=c(min(ACF)-.1,min(1,max(ACF+.4))), main = "ACF of Residuals")
abline(h=c(0,-L,L), lty=c(1,2,2), col=c(1,4,4))
stats::qqnorm(stdres, main="Normal Q-Q Plot of Std Residuals"); stats::qqline(stdres, col=4)
nlag <- ifelse(S<4, 20, 3*S)
ppq <- p+q+P+Q
pval <- numeric(nlag)
for (i in (ppq+1):nlag) {u <- stats::Box.test(rs, i, type = "Ljung-Box")$statistic
pval[i] <- stats::pchisq(u, i-ppq, lower.tail=FALSE)}
plot( (ppq+1):nlag, pval[(ppq+1):nlag], xlab = "lag", ylab = "p value", ylim = c(0,
1), main = "p values for Ljung-Box statistic")
abline(h = 0.05, lty = 2, col = "blue")
on.exit(par(old.par))
# end new tsdiag
dev.off()
#
k = length(fitit$coef)
BIC = log(fitit$sigma2)+(k*log(n)/n)
AICc = log(fitit$sigma2)+((n+k)/(n-k-2))
AIC = log(fitit$sigma2)+((n+2*k)/n)
list(fit=fitit, AIC=AIC, AICc=AICc, BIC=BIC)
}
mysarima.for <-
function(xdata,n.ahead,p,d,q,P=0,D=0,Q=0,S=-1,tol=sqrt(.Machine$double.eps),no.constant=FALSE, filename){
xname=deparse(substitute(xdata))
xdata=as.ts(xdata)
n=length(xdata)
constant=1:n
xmean = rep(1,n); if(no.constant==TRUE) xmean=NULL
if (d==0 & D==0) {
fitit=stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"),
xreg=xmean,include.mean=FALSE, optim.control=list(reltol=tol));
nureg=matrix(1,n.ahead,1)
} else if (xor(d==1, D==1) & no.constant==FALSE) {
fitit=stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"),
xreg=constant,optim.control=list(reltol=tol));
nureg=(n+1):(n+n.ahead)
} else { fitit=stats::arima(xdata, order=c(p,d,q), seasonal=list(order=c(P,D,Q), period=S, method="ML"),
optim.control=list(reltol=tol));
nureg=NULL
}
#--
fore=stats::predict(fitit, n.ahead, newxreg=nureg)
#-- graph:
U = fore$pred + 2*fore$se
L = fore$pred - 2*fore$se
a=max(1,n-100)
minx=min(xdata[a:n],L)
maxx=max(xdata[a:n],U)
t1=xy.coords(xdata, y = NULL)$x
if(length(t1)<101) strt=t1[1] else strt=t1[length(t1)-100]
t2=xy.coords(fore$pred, y = NULL)$x
endd=t2[length(t2)]
xllim=c(strt,endd)
pdf(filename)
ts.plot(xdata,fore$pred,col=1:2, type="o", xlim=xllim, ylim=c(minx,maxx), ylab=xname)
lines(fore$pred, col="red", type="p")
lines(U, col="blue", lty="dashed")
lines(L, col="blue", lty="dashed")
dev.off()
#
return(fore)
}
#################################################
# Automatic distribution fitting and selection
#
# Copyright 2014 worldofpiggy.com
#################################################
## library(MASS)
#Usage:
#data, numeric vector of observations of unknown distribution
#fit, a list of distributions to fit data
#sample, rate of subsampling (0.5 means that a sample 50% of data will be considered)
fitData <- function(data, fit="gamma", sample=0.5){
distrib = list()
numfit <- length(fit)
results = matrix(0, ncol=5, nrow=numfit)
for(i in 1:numfit){
if((fit[i] == "gamma") |
(fit[i] == "poisson") |
(fit[i] == "weibull") |
(fit[i] == "exponential") |
(fit[i] == "logistic") |
(fit[i] == "normal") |
(fit[i] == "geometric")
)
distrib[[i]] = fit[i]
else stop("Provide a valid distribution to fit data" )
}
# take a sample of dataset
n = round(length(data)*sample)
data = sample(data, size=n, replace=F)
for(i in 1:numfit) {
if(distrib[[i]] == "gamma") {
gf_shape = "gamma"
fd_g <- fitdistr(data, "gamma")
est_shape = fd_g$estimate[[1]]
est_rate = fd_g$estimate[[2]]
ks = ks.test(data, "pgamma", shape=est_shape, rate=est_rate)
# add to results
results[i,] = c(gf_shape, est_shape, est_rate, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "poisson"){
gf_shape = "poisson"
fd_p <- fitdistr(data, "poisson")
est_lambda = fd_p$estimate[[1]]
ks = ks.test(data, "ppois", lambda=est_lambda)
# add to results
results[i,] = c(gf_shape, est_lambda, "NA", ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "weibull"){
gf_shape = "weibull"
fd_w <- fitdistr(data,densfun=dweibull,start=list(scale=1,shape=2))
est_shape = fd_w$estimate[[1]]
est_scale = fd_w$estimate[[2]]
ks = ks.test(data, "pweibull", shape=est_shape, scale=est_scale)
# add to results
results[i,] = c(gf_shape, est_shape, est_scale, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "normal"){
gf_shape = "normal"
fd_n <- fitdistr(data, "normal")
est_mean = fd_n$estimate[[1]]
est_sd = fd_n$estimate[[2]]
ks = ks.test(data, "pnorm", mean=est_mean, sd=est_sd)
# add to results
results[i,] = c(gf_shape, est_mean, est_sd, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "exponential"){
gf_shape = "exponential"
fd_e <- fitdistr(data, "exponential")
est_rate = fd_e$estimate[[1]]
ks = ks.test(data, "pexp", rate=est_rate)
# add to results
results[i,] = c(gf_shape, est_rate, "NA", ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "logistic"){
gf_shape = "logistic"
fd_l <- fitdistr(data, "logistic")
est_location = fd_l$estimate[[1]]
est_scale = fd_l$estimate[[2]]
ks = ks.test(data, "plogis", location=est_location, scale=est_scale)
# add to results
results[i,] = c(gf_shape, est_location, est_scale, ks$statistic, ks$p.value)
}
}
results = rbind(c("distribution", "param1", "param2", "ks stat", "ks pvalue"), results)
#print(results)
return(results)
}
## see http://bit.ly/1XFdh1h
facetAdjust <- function(x, pos = c("up", "down"))
{
pos <- match.arg(pos)
p <- ggplot_build(x)
gtable <- ggplot_gtable(p); dev.off()
dims <- apply(p$panel$layout[2:3], 2, max)
nrow <- dims[1]
ncol <- dims[2]
panels <- sum(grepl("panel", names(gtable$grobs)))
space <- ncol * nrow
n <- space - panels
if(panels != space){
idx <- (space - ncol - n + 1):(space - ncol)
gtable$grobs[paste0("axis_b",idx)] <- list(gtable$grobs[[paste0("axis_b",panels)]])
if(pos == "down"){
rows <- grep(paste0("axis_b\\-[", idx[1], "-", idx[n], "]"),
gtable$layout$name)
lastAxis <- grep(paste0("axis_b\\-", panels), gtable$layout$name)
gtable$layout[rows, c("t","b")] <- gtable$layout[lastAxis, c("t")]
}
}
class(gtable) <- c("facetAdjust", "gtable", "ggplot"); gtable
}
mono_scale_discrete <- function(col_name, n_cols){
res <- brewer.pal(n_cols, col_name)
return(res)
}
double_scale_discrete <- function(col_name1, col_name2,n_cols_tot, n_cols1){
res1 <- brewer.pal(n_cols_tot, col_name1)
n_cols2 <- n_cols_tot-n_cols1
res2 <- brewer.pal(n_cols_tot, col_name2)
res <- c(res1[n_cols_tot:(n_cols_tot-n_cols1+1)],
rev(res2[n_cols_tot:(n_cols_tot-n_cols2+1)]))
return(res)
}
## double_scale_discrete <- function(col_name1, col_name2,n_cols_tot, n_cols1){
## res1 <- brewer.pal(n_cols_tot, col_name1)
## n_cols2 <- n_cols_tot-n_cols1
## res2 <- brewer.pal(n_cols_tot, col_name2)
## res <- c(res1[n_cols_tot:(n_cols_tot-n_cols1+1)],
## res2[(n_cols2+1):n_cols_tot])
## return(res)
## }
col_gen <- function(n) {
black <- "#000000"
if (n <= 9) {
c(black,brewer.pal(n-1, "Set2"))
} else {
c(black,hcl(h=seq(0,(n-2)/(n-1),
length=n-1)*360,c=100,l=65,fixup=TRUE))
}
}
create_map <- function(country_list, hi_res,
data_geo,data_val){
if (hi_res==1){
worldmap <- readShapeSpatial("TM_WORLD_BORDERS-0.3.shp")
} else{
worldmap <- readShapeSpatial("TM_WORLD_BORDERS_SIMPL-0.3.shp")
}
worldmap <- fortify(worldmap, region="ISO2")
#NB: remove the constraints on latitude and longitude to plot
# the whole world!!!
## latlimits <- c(30, 75)
## longlimits <- c(-15, 35)
len <- length(worldmap$id)
worldmap$value <- rep(NA, len)
for (i in seq(length(country_list))){
country <- country_list[i]
sel <- which(worldmap$id==country)
## sel2 <- which(gdp_yearly$GEO==country)
sel2 <- which(data_geo==country)
## gdp_val <- gdp_yearly$Value[sel2]
val <- data_val[sel2]
## print("gdp_val is, ")
## print(gdp_val)
## sel <- grep(country, worldmap$group)
worldmap$value[sel] <- rep(val,length(sel)) #regions
#are at 3 levels, and I need to make sure that I am giving the
#same value to every country as a whole (not just different values
# to different areas of the same country.)
}
return(worldmap)
}
## require(XML)
longlat <- function(addrs) {
longlat1 <- function(addr) {
# Attempts to retrieve longitude and latitude from place name
url = paste0("http://maps.google.com/maps/api/geocode/xml?address=",
addr, "&sensor=false")
doc <- NA
r <- c(NA, NA)
try(doc <- xmlTreeParse(url), silent = T)
if (!is.na(doc[1])) {
root = xmlRoot(doc)
long = xmlValue(root[["result"]][["geometry"]][["location"]][["lng"]])
lat = xmlValue(root[["result"]][["geometry"]][["location"]][["lat"]])
r <- c(long, lat)
} else {
print(paste("Error: Could not find", addr))
}
return(as.numeric(r))
}
l <- longlat1(addrs[1])
if (length(addrs) > 1) {
for (i in 2:length(addrs)) {
l <- rbind(l, longlat1(addrs[i]))
}
}
return(l)
}
space <- function(v, percent = 10) {
# Provide an extra margin for adding text to a plot
r <- percent/100 * (max(v) - min(v))
return(c(min(v) - r, max(v) + r))
}
remove_link_stat_all_norm <- function(g,data_agg){
counter <- 0
n_edges <- length(E(g))
n_v <- length(V(g))
link_rem <- seq(n_edges)
fragment_sizes <- c()
ratio_sizes <- c()
rg_all <- c()
rel_diff <- c()
## large_frag <- c()
for (i in link_rem){
g2 <- delete.edges(g, E(g)[i])
myfrag <- clusters(g2)$csize
if (length(myfrag)!=2){
## print(myfrag)
counter <- counter+1
myfrag <- c(n_v,0)}
myfrag <- sort(myfrag)
## fragment_sizes <- c(fragment_sizes,temp)} else{
fragment_sizes <- c(fragment_sizes,myfrag)
## large_frag <- c(large_frag, max(myfrag))
## ratio <- my_frag
## sort(myfrag)
ratio <- myfrag[1]/myfrag[2] ## ratio[1]/ratio[2]
ratio_sizes <- c(ratio_sizes, ratio)
mydiff <- abs(diff(myfrag))
rel_diff <- c(rel_diff,mydiff)
member <- clusters(g2)$membership
rg_norm <- get_aggregate_Rg(data_agg,member)
rg_all <- c(rg_all, rg_norm)
}
print("The number of links whose removal does not break the aggregate is, ")
print(counter)
print("and the percentage of the overall number of links is, ")
print((counter/n_edges*100))
fragment_sizes_norm <- fragment_sizes/n_v
## large_frag <- large_frag/n_v
rel_diff <- rel_diff/n_v
res <- list(fragment_sizes, fragment_sizes_norm, ratio_sizes,rel_diff, rg_all )
return(res)
}
remove_link_stat_all <- function(g){
n_edges <- length(E(g))
link_rem <- seq(n_edges)
fragment_sizes <- c()
for (i in link_rem){
g2 <- delete.edges(g, E(g)[i])
myfrag <- clusters(g2)$csize
if (length(myfrag)!=2){
print(myfrag)
temp <- c(0, length(V(g)))
fragment_sizes <- c(fragment_sizes,temp)} else{
fragment_sizes <- c(fragment_sizes,sort(myfrag))}
}
return(fragment_sizes)
}
#Modification of the function above to keep only the smaller fragment.
remove_link_stat_small <- function(g){
n_edges <- length(E(g))
link_rem <- seq(n_edges)
print("the number of nodes is, ")
print(vcount(g))
fragment_sizes <- c()
for (i in link_rem){
g2 <- delete.edges(g, E(g)[i])
myfrag <- clusters(g2)$csize
if (length(myfrag)!=2){
## print(myfrag)
temp <- c(0)
fragment_sizes <- c(fragment_sizes,temp)} else{
fragment_sizes <- c(fragment_sizes,min(myfrag))}
}
return(fragment_sizes)
}
### this function returns a tibble of ordered fragments.
remove_link_stat_all2 <- function(g){
n_edges <- length(E(g))
link_rem <- seq(n_edges)
fragment_sizes <- c()
for (i in link_rem){
g2 <- delete.edges(g, E(g)[i])
myfrag <- clusters(g2)$csize
if (length(myfrag)!=2){
print(myfrag)
temp <- c(0, length(V(g)))
fragment_sizes <- c(fragment_sizes,temp)} else{
fragment_sizes <- c(fragment_sizes,sort(myfrag))}
}
res <- matrix(fragment_sizes, ncol=2, byrow=T) |> as_tibble(name_repair="unique") |>
set_colnames(c("n1", "n2"))
return(res)
}
## function to read some positions and change them into a distance matrix
## and finally a network.
get_aggregate <- function(fname,threshold){
data<-read.table(fname)
dmat<-dist(data,method="euclidean", diag=T, upper=T)
dmat <- as.matrix(dmat)
g <- graph.adjacency(dmat<threshold,mode="undirected",weighted=NULL)
## g <- graph.adjacency(dmat,mode="undirected",weighted=TRUE)
g <- igraph::simplify(g)
return(g)
}
get_aggregate2 <- function(fname,threshold){
data<-read_table2(fname, col_names=F)
dmat<-dist(data,method="euclidean", diag=T, upper=T)
dmat <- as.matrix(dmat)
g <- graph.adjacency(dmat<threshold,mode="undirected",weighted=NULL)
## g <- graph.adjacency(dmat,mode="undirected",weighted=TRUE)
g <- igraph::simplify(g)
return(g)
}
get_frequence <- function(contact){
freq <- table(contact)
#convert table into a matrix
freq <- matrix(c(as.numeric(names(freq)), freq), ncol=length(freq), byrow=TRUE, dimnames=NULL)
freq <- t(freq)
freq <- as.data.frame(freq)
names(freq) <- c("value", "count")
return (freq)
}
drop_last_char <- function(t,n){
substr(t, 1, nchar(t)-n)
}
drop_ini_char <- function(t,n){
substr(t, (n+1), nchar(t))
}
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))
}
substrLeft <- function(x, n){
substr(x,1, n )
}
reshape_series <- function(data, year_cut){
data <- compact_list(data)
data$country <- substrRight(as.character(data$c),2)
names(data) <- c("value", "year", "indicator", "country")
data$indicator <- as.character(data$indicator)
data$value <- as.numeric(data$value)
data <- data[complete.cases(data), ]
sel <- which(data$year>year_cut)
data <- data[sel, ]
return(data)
}
golden <- function(){
golden_ratio <- 1.618
return(golden_ratio)
}
rmnan <- function(data){
if (length(dim(data))!=2){
data <- data[complete.cases(data)]
} else{
data <- data[complete.cases(data), ]
}
return(data)
}
addlinetoplot <- function(dataset, varx, vary) {
list(
geom_line(data=dataset, aes_string(x=varx, y=vary)),
geom_point(data=dataset, aes_string(x=varx, y=vary))
)
}
## See http://bit.ly/1nFohep
## And then your plotting code looks like:
## pltbase <- ggplot() + geom_line(data = df1, aes(x=c1, y=c2))
## pltbase + addlinetoplot(df2, varx = "c1", vary = "csq")
## Functions for orthogonal regression (total least square regression when the
## variance of x and y is the same.
## have a look at http://bit.ly/1PyWK4B and http://bit.ly/1RDAPuA )
tls <- function(X,y){
v <- prcomp(cbind(X,y))$rotation
beta <- -v[-ncol(v),ncol(v)] / v[ncol(v),ncol(v)]
## beta <- as.numeric(beta)
return(beta)
}
tls_beta0 <- function(X,y, beta){
beta0 <- mean(y)-sum(colMeans(X)*beta)
## beta0 <- as.numeric(beta0)
return(beta0)
}
get_world_map <- function(map_location_name){
## require(gpclib) ## no longer needed
## gpclibPermit()
worldmap <- readShapeSpatial(map_location_name)
w_ori <- worldmap
worldmap <- fortify(worldmap)
id_list <- unique(worldmap$id)
worldmap$country <- worldmap$id
iso_list <- unique(w_ori$ISO3)
for (i in seq(length(iso_list))){
sel <- which(worldmap$id==id_list[i])
worldmap$country[sel] <- as.character(iso_list[i])
}
return(worldmap)
}
findCorrelation_fast <- function(x, cutoff = .90, verbose = FALSE){
if(any(!complete.cases(x)))
stop("The correlation matrix has some missing values.")
averageCorr <- colMeans(abs(x))
averageCorr <- as.numeric(as.factor(averageCorr))
x[lower.tri(x, diag = TRUE)] <- NA
combsAboveCutoff <- which(abs(x) > cutoff)
colsToCheck <- ceiling(combsAboveCutoff / nrow(x))
rowsToCheck <- combsAboveCutoff %% nrow(x)
colsToDiscard <- averageCorr[colsToCheck] > averageCorr[rowsToCheck]
rowsToDiscard <- !colsToDiscard
if(verbose){
colsFlagged <- pmin(ifelse(colsToDiscard, colsToCheck, NA),
ifelse(rowsToDiscard, rowsToCheck, NA), na.rm = TRUE)
values <- round(x[combsAboveCutoff], 3)
cat('\n',paste('Combination row', rowsToCheck, 'and column', colsToCheck,
'is above the cut-off, value =', values,
'\n \t Flagging column', colsFlagged, '\n'
))
}
deletecol <- c(colsToCheck[colsToDiscard], rowsToCheck[rowsToDiscard])
deletecol <- unique(deletecol)
deletecol
}
findCorrelation_exact <- function(x, cutoff = 0.90, verbose = FALSE)
{
varnum <- dim(x)[1]
if (!isTRUE(all.equal(x, t(x)))) stop("correlation matrix is not symmetric")
if (varnum == 1) stop("only one variable given")
x <- abs(x)
# re-ordered columns based on max absolute correlation
originalOrder <- 1:varnum
averageCorr <- function(x) mean(x, na.rm = TRUE)
tmp <- x
diag(tmp) <- NA
maxAbsCorOrder <- order(apply(tmp, 2, averageCorr), decreasing = TRUE)
x <- x[maxAbsCorOrder, maxAbsCorOrder]
newOrder <- originalOrder[maxAbsCorOrder]
rm(tmp)
deletecol <- rep(FALSE, varnum)
x2 <- x
diag(x2) <- NA
for (i in 1:(varnum - 1)) {
if(!any(x2[!is.na(x2)] > cutoff)){
if (verbose) cat("All correlations <=", cutoff, "\n")
break()
}
if (deletecol[i]) next
for (j in (i + 1):varnum) {
if (!deletecol[i] & !deletecol[j]) {
if (x[i, j] > cutoff) {
mn1 <- mean(x2[i,], na.rm = TRUE)
mn2 <- mean(x2[-j,], na.rm = TRUE)
if(verbose) cat("Compare row", newOrder[i],
" and column ", newOrder[j],
"with corr ", round(x[i,j], 3), "\n")
if (verbose) cat(" Means: ", round(mn1, 3), "vs", round(mn2, 3))
if (mn1 > mn2) {
deletecol[i] <- TRUE
x2[i, ] <- NA
x2[, i] <- NA
if (verbose) cat(" so flagging column", newOrder[i], "\n")
}
else {
deletecol[j] <- TRUE
x2[j, ] <- NA
x2[, j] <- NA
if (verbose) cat(" so flagging column", newOrder[j], "\n")
}
}
}
}
}
newOrder[which(deletecol)]
}
findCorrelation <- function(x, cutoff = 0.90, verbose = FALSE, names = FALSE, exact = ncol(x) < 100) {
if(names & is.null(colnames(x)))
stop("'x' must have column names when `names = TRUE`")
out <- if(exact)
findCorrelation_exact(x = x, cutoff = cutoff, verbose = verbose) else
findCorrelation_fast(x = x, cutoff = cutoff, verbose = verbose)
out
if(names) out <- colnames(x)[out]
out
}
topK <- function(x,k, myname="other"){
tbl <- tabulate(x)
names(tbl) <- levels(x)
x <- as.character(x)
levelsToKeep <- names(tail(sort(tbl),k))
x[!(x %in% levelsToKeep)] <- myname
factor(x)
}
force_same_levels <- function(x, levelsToKeep){
x <- as.character(x)
x[!(x %in% levelsToKeep)] <- 'other'
factor(x)
}
## New not in function. See http://bit.ly/2ms96ZY
'%!in%' <- function(x,y)!('%in%'(x,y))
##function for moving averages
## see http://bit.ly/2mOHN8c
## and avoid filter being overwritten by dplyr
ma <- function(x,n=5){stats::filter(x,rep(1/n,n), sides=1)}
#change all the factors to characters
# see http://bit.ly/2nCx8B5
to_char <- function(df){
## res <- df %>% mutate_if(is.factor, as.character)
## res <- df %>% mutate(across(where(is.factor), ~as.character(.x)))
res <- df |> mutate(across(where(is.factor), \(x) as.character(x)))
return(res)
}
to_num <- function(df){
## res <- df %>% mutate_if(is.character, as.numeric)
## res <- df %>% mutate(across(where(is.character), ~as.numeric(.x)))
res <- df |> mutate(across(where(is.character), \(x) as.numeric(x)))
return(res)
}
to_int <- function(df){
## res <- df %>% mutate_if(is.character, as.integer)
## res <- df %>% mutate(across(where(is.character), ~as.integer(.x)))
res <- df |> mutate(across(where(is.character), \(x) as.integer(x)))
return(res)
}
############ to avoid troubles when saving numbers
num_to_char <- function(df){
## res <- df %>% mutate_if(is.numeric, as.character )
## res <- df %>% mutate(across(where(is.numeric), ~as.character(.x) ))
res <- df |> mutate(across(where(is.numeric), \(x) as.character(x) ))
return(res)
}
all_to_lower <- function(df){
## res <- df %>% mutate(across(where(is.character), tolower) )
res <- df |> mutate(across(where(is.character), \(x) tolower(x)) )
return(res)
}
all_to_upper <- function(df){
## res <- df %>% mutate(across(where(is.character), toupper) )
res <- df |> mutate(across(where(is.character), \(x) toupper(x)) )
return(res)
}
## all_to_lower <- function(df){
## res <- df %>% mutate_if(is.character, tolower)
## return(res)
## }
to_csv <- function(df, ...){
df <- num_to_char(df)
write_csv(df, ...)
}
save_csv <- function(data, fname){
write.table(data,
fname,
row.names=FALSE, col.names=TRUE, sep=",")
}
## see https://frama.link/cjjbZY8L
round_all <- function(df, n){
## res <- df %>% mutate_if(is.numeric, list(~round(., n)) )
## res <- df %>% mutate(across(where(is.numeric), ~round(.x,n) ))
res <- df |> mutate(across(where(is.numeric), \(x) round(x,n) ))
return(res)
}
round_cols <- function(df, mycols, n){
## res <- df %>%
## mutate(across({{mycols}},~round(.x,n) ))
res <- df |>
mutate(across({{mycols}},\(x) round(x,n) ))
return(res)
}
#### a function to convert all the numbers to text with a given number of decimals
## format_all <- function(df, n, my_sep=" "){
## myf <- paste("%.",n,"f", sep="")
## ## sprintf("%.1f")
## res <- round_all(df, n) %>%
## mutate(across(where(is.numeric),~sprintf(myf,.x)))
## ## mutate_if(is.numeric,list(~sprintf(myf,.)))
## return(res)
## }
format_all <- function(df, n, my_sep=" "){
## res <- df %>%
## mutate(across(where(is.numeric), ~format(round(.x, n), nsmall = n, big.mark= my_sep, trim=TRUE )))
res <- df |>
mutate(across(where(is.numeric), \(x) format(round(x, n), nsmall = n, big.mark= my_sep, trim=TRUE )))
return(res)
}
format_all_preserve_sum <- function(df, n, my_sep=" "){
## res <- df %>%
## mutate(across(where(is.numeric), ~format(round_preserve_sum(.x, n), nsmall = n, big.mark= my_sep, trim=TRUE )))
res <- df |>
mutate(across(where(is.numeric), \(x) format(round_preserve_sum(x, n), nsmall = n, big.mark= my_sep, trim=TRUE )))
return(res)
}
format_col <- function(x, n, my_sep=" "){
res <- format(round(x, n), nsmall = n, big.mark= my_sep, trim=TRUE )
return(res)
}
format_col_percentage <- function(x, n, my_sep=" "){
res <- format(round(x, n), nsmall = n, big.mark= my_sep, trim=TRUE ) |>
paste("%", sep="")
return(res)
}
format_col_preserve_sum <- function(x, n, my_sep=" "){
res <- format(round_preserve_sum(x, n), nsmall = n, big.mark= my_sep, trim=TRUE )
return(res)
}
format_col_preserve_sum_percentage <- function(x, n, my_sep=" "){
res <- format(round_preserve_sum(x, n), nsmall = n, big.mark= my_sep, trim=TRUE ) |>
paste("%", sep="")
return(res)
}
add_percentage <- function(x){
res <- paste(x,"%", sep="")
return(res)
}
## format_cols <- function(df, mycols, n){
## myf <- paste("%.",n,"f", sep="")
## ## sprintf("%.1f")
## res <- round_cols(df,{{mycols}}, n) %>%
## mutate(across({{mycols}},~sprintf(myf,.x)))
## return(res)
## }
format_cols <- function(df, mycols, n, my_sep=" "){
## res <- round_cols(df,{{mycols}}, n) %>%
## mutate(across({{mycols}},~format(.x, nsmall = n,
## big.mark= my_sep, trim=TRUE )))
res <- round_cols(df,{{mycols}}, n) |>
mutate(across({{mycols}},\(x) format(x, nsmall = n,
big.mark= my_sep, trim=TRUE )))
return(res)
}
format_cols_preserve_sum <- function(df, mycols, n, my_sep=" "){
## res <- round_cols(df,{{mycols}}, n) %>%
## mutate(across({{mycols}},~format_col_preserve_sum(.x, nsmall = n,
## big.mark= my_sep, trim=TRUE )))
res <- round_cols(df,{{mycols}}, n) |>
mutate(across({{mycols}}, \(x) format_col_preserve_sum(x, nsmall = n,
big.mark= my_sep, trim=TRUE )))
return(res)
}
## ugly function which does not work well at all.
## round_all_digits <- function(df, n){
## res <- df %>% mutate_if(is.numeric, list( ~ format(round(., digits=n),
## nsmall = n)) )
## return(res)
## }
## define a helper function to replace every empty space as a na.
## See http://bit.ly/2pItlEi
empty_as_na <- function(x){
if("factor" %in% class(x)) x <- as.character(x) ## since ifelse wont work with factors
ifelse(as.character(x)!="", x, NA)
}
#### generalize to treat a certain pattern as a na
mypattern_as_na <- function(x, mypattern){
if("factor" %in% class(x)) x <- as.character(x) ## since ifelse wont work with factors
ifelse(as.character(x)!=mypattern, x, NA)
}
## I need a curried function (already partially applied) if I want to use it inside
## mutate_each ---> see http://bit.ly/2pIlkzs
mypattern_as_na_curried <- function(x){
mypattern_as_na(x, mypattern)
}
na_cols <- function(mymatrix){
res <- colnames(mymatrix)[colSums(is.na(mymatrix)) > 0]
return(res)
}
## complete_data <- function(data){
## res <- data %>% filter(complete.cases(.))
## return(res)
## }
## See https://stackoverflow.com/a/72801062/2952838
complete_data <- function(data){
res <- data |> (\(x) filter(x, complete.cases(!!x)))()
return(res)
}
#I define an alias
complete_cases <- complete_data
markov_chain <- function(ini_state,x,N){
y <- numeric(N)
y[1] <- ini_state
for(i in 2:N) {
y[i]=which(rmultinom(1, size = 1, prob = x[y[i-1], ])==1) }
return(y)
}
## see http://bit.ly/1aRcBxJ
simMarkov <- function(ini_state, P, len=1000) {
n <- NROW(P)
result <- numeric(len)
result[1] <- ini_state
for (i in 2:len) {
result[i] <- sample(1:n, 1, prob=P[ result[i-1], ])
}
result
}
# This function returns TRUE wherever elements are the same, including NA's,
# and FALSE everywhere else.
compareNA <- function(v1,v2) {
same <- (v1 == v2) | (is.na(v1) & is.na(v2))
same[is.na(same)] <- FALSE
return(same)
}
## Change a missing value to a certain pattern
na_to_pattern <- function(df, pattern){
if (is.numeric(pattern)){
res <- df |>
mutate(across(where(is.numeric),
\(x) replace(x, is.na(x), pattern)))
} else if (is.character(pattern)) {
res <- df |>
mutate(across(where(is.character), \(x) replace(x, is.na(x), pattern)))
} else { print ("error") }
return(res)
}
na_to_pattern_col <- function(x, pattern){
res <- replace(x, is.na(x), pattern)
return(res)
}
na_num_to_pattern <- function(df, pattern){
## res <- df %>%
## mutate(across(where(is.numeric), ~replace(.x, is.na(.x), pattern)))
res <- df |>
mutate(across(where(is.numeric),
\(x) replace(x, is.na(x), pattern) ))
return(res)
}
## and the specific cases for a numeric and character NA.
na_char_to_pattern <- function(df, pattern){
## res <- df %>%
## mutate(across(where(is.character), ~replace(.x, is.na(.x), pattern)))
res <- df |>
mutate(across(where(is.character), \(x) replace(x, is.na(x), pattern)))
return(res)
}
## the other way around, consider a certain pattern as an NA
## See https://github.com/tidyverse/dplyr/issues/6645#issuecomment-1397035215
pattern_to_na <- function(df, pattern){
## res <- df %>% na_if(., pattern)
## res <- df |> (\(x) na_if(x, pattern))()
## across(where(is.character), \(x) na_if(x, "D"))
res <- df |>
mutate(across(where(is.character), \(x) na_if(x, pattern)))
return(res)
}
number_to_na <- function(df, number=0){
res <- df |>
mutate(across(where(is.numeric), \(x) na_if(x, number)))
return(res)
}
## generic function to replace a pattern to another
## (works only for an exact match)
pattern_to_pattern <- function(df, pattern1, pattern2){
## res <- df %>% replace(.,.== pattern1, pattern2)
if (!is.na(pattern1)){
## res <- df |> (\(x) replace(x,x== pattern1, pattern2))()
res <- df |>
mutate(across(where(is.character), \(x) replace(x,x== pattern1, pattern2)))
} else{
res <- df |> na_to_pattern(pattern2)
}
return(res)
}
## this works also for a partial match
search_replace <- function(df, pattern_search, pattern_replace){
## res <- df %>%
## mutate(across(where(is.character),
## stringr::str_replace_all, pattern = pattern_search,
## replacement = pattern_replace))
res <- df |>
mutate(across(where(is.character),
\(x) stringr::str_replace_all(x, pattern = pattern_search,
replacement = pattern_replace)))
return(res)
}
## search_replace <- function(df, pattern_search, pattern_replace){
## x <- df %>%
## mutate_if(is.character,
## stringr::str_replace_all, pattern = pattern_search,
## replacement = pattern_replace)
## }
#### round all the numbers to a certain precision
## see https://kutt.it/Du2GzW
round_all_to_n <- function(df, n){
## res <- df %>% mutate_if(is.numeric, round, n)
## res <- df %>% mutate(across(where(is.numeric), round, n))
res <- df |> mutate(across(where(is.numeric), \(x) round(x,n)))
return(res)
}
round_all_to_n_preserve_sum <- function(df, n){
## res <- df %>% mutate_if(is.numeric, round_preserve_sum, n)
## res <- df %>% mutate(across(where(is.numeric), round_preserve_sum, n))
res <- df |> mutate(across(where(is.numeric), \(x) round_preserve_sum(x,n)))
return(res)
}
## count the number of missing value per column
count_na <- function(df){
## df %>% summarise_all(funs(sum(is.na(.))))
## df %>% summarise(across(everything(), ~sum(is.na(.x))))
df |> summarise(across(everything(), \(x) sum(is.na(x))))
}
count_na_col <- function(x){
## x %>% is.na %>% sum
x |> is.na() |> sum()
}
## as above, but to count the number of unique elements
count_unique <- function(df){
## df %>% summarise_all(funs(sum(is.na(.))))
## df %>% summarise(across(everything(), ~length(unique(.x))))
res <- df |> summarise(across(everything(), \(x) length(unique(x))))
return(res)
}
count_unique_col <- function(x){
## df %>% summarise_all(funs(sum(is.na(.))))
## x %>% unique %>% length
x |> unique() |> length()
}
matrix_plotting <- function(data, title, labels_x, labels_y, name,
digit_num,col_num,
pos1=6.3,pos2=1,pos3=5.5,pos4=3,h=5,w=5){
tikz(name,standAlone=T, width=w, height=h)
oldpar<-par( mar=c(7,4,4,6) ,
cex.axis=1.4,cex.lab=1.6,cex.main=1.6)
testcol<-color.gradient(c(0.2,1),c(0.2,0.5),c(0,0),nslices=col_num)
my_min_mat <- min(data)
col.labels<-c(formatC(min(data),format="f",digits=digit_num),
formatC(max(data),format="f",digits=digit_num))
color2D.matplot(data,main=title,c(0.2,1),c(0.2,0.5),c(0,0),
xlab = "",
ylab="",
show.legend=FALSE,
show.values=0,vcol="black",vcex=1, axes=FALSE)
axis(1,at=seq(0.5,(length(labels_x)-.1), by=1),labels=labels_x)
axis(2,at=rev(seq(0.5,(length(labels_y)-.1), by=1)),
labels=labels_y, las=1 )
color.legend(pos1,pos2,pos3,pos4,col.labels,testcol,align="rb",
gradient="y", col="white")
for(i in 1:(length(labels_x))) {
for(j in 1:length(labels_y)) {
text(i-0.5,j-0.5,
formatC(data[(length(labels_y)+1-j),i],
format="f",digits=digit_num),
col="white")
}
}
box()
par(oldpar)
dev.off()
tools::texi2dvi(name,pdf=T)
}
# Function to calculate first-order Markov transition matrix.
# Each *row* corresponds to a single run of the Markov chain
trans.matrix <- function(X, prob=T)
{
tt <- table( c(X[,-ncol(X)]), c(X[,-1]) )
if(prob) tt <- tt / rowSums(tt)
tt
}
CI_markov_correct <- function(data, N_rep){
res_list <- list()
mat_list <- list()
count_mat <- trans.matrix(data, prob=F)
count_list <- rowSums(count_mat)
prob_mat <- trans.matrix(data, prob=T) #just the transition matrix
temp_mat <- count_mat
ncol <- dim(count_mat)[2]
nrow <- dim(count_mat)[1]
seq_count <- seq(ncol)
seq_to_sample <- seq(ncol)
for (m in seq(N_rep)){
for (i in seq(nrow)){
bootstrap <- sample(seq_to_sample,size=count_list[i],
replace=T,prob=prob_mat[i,])
for (k in seq(ncol)){
seq_count[k] <- length(which(bootstrap==seq_to_sample[k]))
}
temp_mat[i, ] <- seq_count
}
mat_list[[m]] <- temp_mat/rowSums(temp_mat)
}
CI_05_mat <- mat_list[[1]]
CI_95_mat <- mat_list[[1]]
ncol <- dim(CI_05_mat)[2]
nrow <- dim(CI_05_mat)[1]
myseq <- seq(N_rep)
for (i in seq(nrow)){
for (j in seq(ncol)){
for (m in seq(N_rep)){
mymat <- mat_list[[m]]
myseq[m] <- mymat[i,j]
}
CI_05_mat[i,j] <- quantile(myseq, 0.025)
CI_95_mat[i,j] <- quantile(myseq, 0.975)
}
}
res_list[[1]] <- CI_05_mat
res_list[[2]] <- CI_95_mat
return(res_list)
}
CI_markov <- function(data, N_rep){
res_list <- list()
mat_list <- list()
data_samp <- data
ncol <- dim(data)[2]
for (i in seq(N_rep)){
for (j in seq(ncol)){
data_samp[, j] <- sample(data[,j], replace=TRUE)
}
t_mat <- trans.matrix(data_samp)
mat_list[[i]] <- t_mat
}
CI_05_mat <- mat_list[[1]]
CI_95_mat <- mat_list[[1]]
ncol <- dim(CI_05_mat)[2]
nrow <- dim(CI_05_mat)[1]
## print("ncol and nrow are")
## print(ncol)
## print(nrow)
myseq <- seq(N_rep)
for (i in seq(nrow)){
for (j in seq(ncol)){
for (m in seq(N_rep)){
mymat <- mat_list[[m]]
myseq[m] <- mymat[i,j]
}
CI_05_mat[i,j] <- quantile(myseq, 0.025)
CI_95_mat[i,j] <- quantile(myseq, 0.975)
}
}
res_list[[1]] <- CI_05_mat
res_list[[2]] <- CI_95_mat
return(res_list)
}
## a function to remove duplicated columns (see http://bit.ly/2s0q6cC )
rem_dupl_cols <- function(df){
res <- df[, !duplicated(t(df))]
return(res)
}
## a function to remove constant columns (see http://bit.ly/2s1qqaY
## and http://bit.ly/2s1Sb3q . The solution in the second link is better
## because it works also with factors and characters )
rem_const_cols <- function(df){
## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]
## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
## res <- df %>%
## select(where(~length(unique(na.omit(.x))) > 1))
res <- df |>
select(where(\(x) length(unique(na.omit(x))) > 1))
return(res)
}
###########################
## Some functions for clustering
distmat <- function(data, dist_method){
cols_NA <- colnames(data)[colSums(is.na(data)) > 0]
cols_not_NA <- setdiff(names(data), cols_NA)
data_red <- subset(data, select= cols_not_NA)
data_red <- as.matrix(data_red)
data_red <- t(data_red)
distMatrix <- dist(data_red, method=dist_method)
return(distMatrix)
}
cluster_data <- function(data, dist_method, cluster_method){
cols_NA <- colnames(data)[colSums(is.na(data)) > 0]
cols_not_NA <- setdiff(names(data), cols_NA)
data_red <- subset(data, select= cols_not_NA)
data_red <- as.matrix(data_red)
data_red <- t(data_red)
distMatrix <- dist(data_red, method=dist_method)
hc <- hclust(distMatrix, method=cluster_method)
return(hc)
}
#Mode function
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
##########################################################
##Function to calculate multiple lags
lag_calc <- function(x, select_exp, n){
select_exp_enq <- enquo(select_exp)
xx <- x %>% select(!!select_exp_enq)
df1 <- stats::lag(zoo(xx), 0:-n) %>%
as_tibble
names(df1) <- make.names(names(df1))
res <- x %>% bind_cols(df1)
sel <- grepl("lag0", colnames(res))
res <- res[ , !sel]
return(res)
}
#Miscellaneous dplyr functions -- see https://edwinth.github.io/blog/dplyr-recipes/
bare_to_quo <- function(x, var){
x %>% select(!!var) %>% head(1)
}
## bare_to_quo(mtcars, quo(cyl))
bare_to_quo_in_func <- function(x, var) {
var_enq <- enquo(var)
x %>% select(!!var_enq) %>% head(1)
}
## bare_to_quo_in_func(mtcars, mpg)
bare_to_name <- function(x, nm) {
nm_name <- quo_name(nm)
x %>% mutate(!!nm_name := 42) %>% head(1) %>%
select(!!nm)
}
## bare_to_name(mtcars, quo(this_is_42))
quo_to_text <- function(x, var) {
var_enq <- enquo(var)
glue::glue("The following column was selected: {rlang::quo_text(var_enq)}")
}
## quo_to_text(mtcars, cyl)
char_to_quo <- function(x, var) {
var_enq <- rlang::sym(var)
x %>% select(!!var_enq) %>% head(1)
}
## char_to_quo(mtcars, "vs")
bare_to_quo_mult <- function(x, ...) {
grouping <- quos(...)
x %>% group_by(!!!grouping) %>% summarise(nr = n())
}
## bare_to_quo_mult(mtcars, vs, cyl)
bare_to_quo_mult_chars <- function(x, ...) {
grouping <- rlang::syms(...)
x %>% group_by(!!!grouping) %>% summarise(nr = n())
}
## bare_to_quo_mult_chars(mtcars, list("vs", "cyl"))
filter_func <- function(x, filter_exp) {
filter_exp_enq <- enquo(filter_exp)
x %>% filter(!!filter_exp_enq)
}
## filter_func(mtcars, hp == 93)
filter_by_char <- function(x, char) {
func_call <- rlang::parse_expr(char)
x %>% filter(!!func_call)
}
## filter_by_char(mtcars, "cyl == 6") %>% head(1)
select_func <- function(x, select_exp) {
select_exp_enq <- enquo(select_exp)
x %>% select(!!select_exp_enq)
}
####################################################################
####################################################################
####################################################################
####################################################################
ym <- function (..., quiet = FALSE, tz = NULL, locale = Sys.getlocale("LC_TIME")) {
lubridate:::.parse_xxx(..., orders = "ym", quiet = quiet, tz = tz, locale = locale,
truncated = 0)
}
my <- function (..., quiet = FALSE, tz = NULL, locale = Sys.getlocale("LC_TIME")) {
lubridate:::.parse_xxx(..., orders = "my", quiet = quiet, tz = tz, locale = locale,
truncated = 0)
}
###############################################à
###############################################à
###############################################à
##Function to calculate the growth on the extremes of a sequence
## (useful when combined with rollify to have a moving growth rate)
growth_time <- function(x){
a <- x[1]
b <- tail(x,1)
g_rate <- (b-a)/a
return(g_rate)
}
growth_rate <- function(x){
res <- growth_time(x)
return(res)
}
## See https://github.com/tidyverse/dplyr/issues/2960
#' Simulate the group_by/mutate pattern with an explicit summarize and join.
#'
#' Group a data frame by the groupingVars and compute user summaries on
#' this data frame (user summaries specified in ...), then join these new
#' columns back into the original data and return to the user.
#' This works around https://github.com/tidyverse/dplyr/issues/2960 .
#' And it is a demonstration of a higher-order dplyr verb.
#' Author: John Mount, Win-Vector LLC.
#'
#' @param d data.frame
#' @param groupingVars character vector of column names to group by.
#' @param ... list of dplyr::mutate() expressions.
#' @value d with grouped summaries added as extra columns
#'
#' @examples
#'
#' add_group_summaries(mtcars,
#' c("cyl", "gear"),
#' group_mean_mpg = mean(mpg),
#' group_mean_disp = mean(disp)) %>%
#' head()
#'
#' @export
#'
add_group_summaries <- function(d, groupingVars, ...) {
# convert char vector into quosure vector
# worked out after reading http://dplyr.tidyverse.org/articles/programming.html
# no idea if this will be stable across rlang/tidyeval versions
groupingQuos <- lapply(groupingVars,
function(si) { quo(!!as.name(si)) })
# print(groupingQuos)
dg <- group_by(d, !!!groupingQuos)
ds <- summarize(dg, ...)
# Work around: https://github.com/tidyverse/dplyr/issues/2963
ds <- ungroup(ds)
left_join(d, ds, by= groupingVars)
}
##function to remove some specific words from a string.
## See https://kutt.it/ZYL1M7 and solution by Joao
remove_words <- function(x, .stopwords) {
## x %>%
## stringr::str_split(" ") %>%
## purrr::flatten_chr() %>%
## setdiff(.stopwords) %>%
## stringr::str_c(collapse = " ")
x |>
stringr::str_split(" ") |>
purrr::flatten_chr() |>
setdiff(.stopwords) |>
stringr::str_c(collapse = " ")
}
##I use the curly curly notation to write a function with map !!!
remove_words_vectorized <- function(str, .stopwords){
purrr::map_chr(str, remove_words, {{.stopwords}})
}
#A function to measure the variability of a vector. See https://kutt.it/ga9RWN
l2_norm <- function(x){
x <- x/sum(x)
d <- length(x)
n <- sqrt(sum(x^2))
res <- (n*sqrt(d)-1)/(sqrt(d)-1)
return(res)
}
###########################################################################à###########################################################################à###########################################################################à###########################################################################à###########################################################################à
## I add some codes which are always useful
iso2 <- structure(list(code = c("AF", "AX", "AL", "DZ", "AS", "AD", "AO",
"AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ", "BS", "BH",
"BD", "BB", "BY", "BE", "BZ", "BJ", "BM", "BT", "BO", "BQ", "BA",
"BW", "BV", "BR", "IO", "BN", "BG", "BF", "BI", "KH", "CM", "CA",
"CV", "KY", "CF", "TD", "CL", "CN", "CX", "CC", "CO", "KM", "CG",
"CD", "CK", "CR", "CI", "HR", "CU", "CW", "CY", "CZ", "DK", "DJ",
"DM", "DO", "EC", "EG", "SV", "GQ", "ER", "EE", "ET", "FK", "FO",
"FJ", "FI", "FR", "GF", "PF", "TF", "GA", "GM", "GE", "DE", "GH",
"GI", "GR", "GL", "GD", "GP", "GU", "GT", "GG", "GN", "GW", "GY",
"HT", "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR", "IQ",
"IE", "IM", "IL", "IT", "JM", "JP", "JE", "JO", "KZ", "KE", "KI",
"KP", "KR", "XK", "KW", "KG", "LA", "LV", "LB", "LS", "LR", "LY",
"LI", "LT", "LU", "MO", "MK", "MG", "MW", "MY", "MV", "ML", "MT",
"MH", "MQ", "MR", "MU", "YT", "MX", "FM", "MD", "MC", "MN", "ME",
"MS", "MA", "MZ", "MM", "NA", "NR", "NP", "NL", "AN", "NC", "NZ",
"NI", "NE", "NG", "NU", "NF", "MP", "NO", "OM", "PK", "PW", "PS",
"PA", "PG", "PY", "PE", "PH", "PN", "PL", "PT", "PR", "QA", "RS",
"RE", "RO", "RU", "RW", "BL", "SH", "KN", "LC", "MF", "PM", "VC",
"WS", "SM", "ST", "SA", "SN", "CS", "SC", "SL", "SG", "SX", "SK",
"SI", "SB", "SO", "ZA", "GS", "SS", "ES", "LK", "SD", "SR", "SJ",
"SZ", "SE", "CH", "SY", "TW", "TJ", "TZ", "TH", "TL", "TG", "TK",
"TO", "TT", "TN", "TR", "XT", "TM", "TC", "TV", "UG", "UA", "AE",
"GB", "US", "UM", "UY", "UZ", "VU", "VE", "VN", "VG", "VI", "WF",
"EH", "YE", "ZM", "ZW"), country = c("Afghanistan", "Aland Islands",
"Albania", "Algeria", "American Samoa", "Andorra", "Angola",
"Anguilla", "Antarctica", "Antigua and Barbuda", "Argentina",
"Armenia", "Aruba", "Australia", "Austria", "Azerbaijan", "Bahamas",
"Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize",
"Benin", "Bermuda", "Bhutan", "Bolivia", "Bonaire, Sint Eustatius and Saba",
"Bosnia and Herzegovina", "Botswana", "Bouvet Island", "Brazil",
"British Indian Ocean Territory", "Brunei Darussalam", "Bulgaria",
"Burkina Faso", "Burundi", "Cambodia", "Cameroon", "Canada",
"Cape Verde", "Cayman Islands", "Central African Republic", "Chad",
"Chile", "China", "Christmas Island", "Cocos (Keeling) Islands",
"Colombia", "Comoros", "Congo", "Congo, The Democratic Republic of",
"Cook Islands", "Costa Rica", "Cote d'Ivoire", "Croatia", "Cuba",
"Curaçao", "Cyprus", "Czechia", "Denmark", "Djibouti", "Dominica",
"Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea",
"Eritrea", "Estonia", "Ethiopia", "Falkland Islands (Malvinas)",
"Faroe Islands", "Fiji", "Finland", "France", "French Guiana",
"French Polynesia", "French Southern Territories", "Gabon", "Gambia",
"Georgia", "Germany", "Ghana", "Gibraltar", "Greece", "Greenland",
"Grenada", "Guadeloupe", "Guam", "Guatemala", "Guernsey", "Guinea",
"Guinea-Bissau", "Guyana", "Haiti", "Heard and Mc Donald Islands",
"Holy See (Vatican City State)", "Honduras", "Hong Kong", "Hungary",
"Iceland", "India", "Indonesia", "Iran, Islamic Republic of",
"Iraq", "Ireland", "Isle of Man", "Israel", "Italy", "Jamaica",
"Japan", "Jersey", "Jordan", "Kazakhstan", "Kenya", "Kiribati",
"Korea, Democratic People's Republic of", "South Korea",
"Kosovo (temporary code)", "Kuwait", "Kyrgyzstan", "Lao, People's Democratic Republic",
"Latvia", "Lebanon", "Lesotho", "Liberia", "Libyan Arab Jamahiriya",
"Liechtenstein", "Lithuania", "Luxembourg", "Macao", "Macedonia, The Former Yugoslav Republic Of",
"Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta",
"Marshall Islands", "Martinique", "Mauritania", "Mauritius",
"Mayotte", "Mexico", "Micronesia, Federated States of", "Moldova, Republic of",
"Monaco", "Mongolia", "Montenegro", "Montserrat", "Morocco",
"Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands",
"Netherlands Antilles", "New Caledonia", "New Zealand", "Nicaragua",
"Niger", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
"Norway", "Oman", "Pakistan", "Palau", "Palestinian Territory, Occupied",
"Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
"Pitcairn", "Poland", "Portugal", "Puerto Rico", "Qatar", "Republic of Serbia",
"Reunion", "Romania", "Russia", "Rwanda", "Saint Barthélemy",
"Saint Helena", "Saint Kitts & Nevis", "Saint Lucia", "Saint Martin",
"Saint Pierre and Miquelon", "Saint Vincent and the Grenadines",
"Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia",
"Senegal", "Serbia and Montenegro", "Seychelles", "Sierra Leone",
"Singapore", "Sint Maarten", "Slovakia", "Slovenia", "Solomon Islands",
"Somalia", "South Africa", "South Georgia & The South Sandwich Islands",
"South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Svalbard and Jan Mayen",
"Swaziland", "Sweden", "Switzerland", "Syrian Arab Republic",
"Taiwan, Province of China", "Tajikistan", "Tanzania, United Republic of",
"Thailand", "Timor-Leste", "Togo", "Tokelau", "Tonga", "Trinidad and Tobago",
"Tunisia", "Turkey", "Turkish Rep N Cyprus (temporary code)",
"Turkmenistan", "Turks and Caicos Islands", "Tuvalu", "Uganda",
"Ukraine", "United Arab Emirates", "United Kingdom", "United States",
"United States Minor Outlying Islands", "Uruguay", "Uzbekistan",
"Vanuatu", "Venezuela", "Vietnam", "Virgin Islands, British",
"Virgin Islands, U.S.", "Wallis and Futuna", "Western Sahara",
"Yemen", "Zambia", "Zimbabwe")), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -253L), spec = structure(list(
cols = list(code = structure(list(), class = c("collector_character",
"collector")), country = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
iso3 <- structure(list(country = c("Afghanistan", "Albania", "Algeria",
"American Samoa", "Andorra", "Angola", "Anguila", "Antigua and Barbuda",
"Argentina", "Armenia", "Aruba", "Australia", "Austria", "Azerbaijan",
"Bahamas, The", "Bahrain", "Bangladesh", "Barbados", "Belarus",
"Belgium", "Belgium-Luxembourg", "Belize", "Benin", "Bermuda",
"Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Br. Antr. Terr",
"Brazil", "British Indian Ocean Ter.", "British Virgin Islands",
"Brunei", "Bulgaria", "Burkina Faso", "Burundi", "Cambodia",
"Cameroon", "Canada", "Cape Verde", "Cayman Islands", "Central African Republic",
"Chad", "Chile", "China", "Christmas Island", "Cocos (Keeling) Islands",
"Colombia", "Comoros", "Congo, Dem. Rep.", "Congo, Rep.", "Cook Islands",
"Costa Rica", "Cote d'Ivoire", "Croatia", "Cuba", "Cyprus", "Czechia",
"Czechoslovakia", "Denmark", "Djibouti", "Dominica", "Dominican Republic",
"East Timor", "Ecuador", "Egypt, Arab Rep.", "El Salvador", "Equatorial Guinea",
"Eritrea", "Estonia", "Ethiopia (excludes Eritrea)", "Ethiopia (includes Eritrea)",
"European Union", "Faeroe Islands", "Falkland Island", "Fiji",
"Finland", "Fm Panama Cz", "Fm Rhod Nyas", "Fm Tanganyik", "Fm Vietnam Dr",
"Fm Vietnam Rp", "Fm Zanz-Pemb", "Fr. So. Ant. Tr", "France",
"Free Zones", "French Guiana", "French Polynesia", "Gabon", "Gambia, The",
"Gaza Strip", "Georgia", "German Democratic Republic", "Germany",
"Ghana", "Gibraltar", "Greece", "Greenland", "Grenada", "Guadeloupe",
"Guam", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti",
"Holy See", "Honduras", "Hong Kong, China", "Hungary", "Iceland",
"India", "Indonesia", "Iran, Islamic Rep.", "Iraq", "Ireland",
"Israel", "Italy", "Jamaica", "Japan", "Jhonston Island", "Jordan",
"Kazakhstan", "Kenya", "Kiribati", "Korea, Dem. Rep.", "Korea, Rep.",
"Kuwait", "Kyrgyz Republic", "Lao PDR", "Latvia", "Lebanon",
"Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
"Luxembourg", "Macao", "Macedonia, FYR", "Madagascar", "Malawi",
"Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands",
"Martinique", "Mauritania", "Mauritius", "Mexico", "Micronesia, Fed. Sts.",
"Midway Islands", "Moldova", "Monaco", "Mongolia", "Montserrat",
"Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal",
"Netherlands", "Netherlands Antilles", "Neutral Zone", "New Caledonia",
"New Zealand", "Nicaragua", "Niger", "Nigeria", "Niue", "Norfolk Island",
"Northern Mariana Islands", "Norway", "Oman", "Pacific Islands",
"Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay",
"Pen Malaysia", "Peru", "Philippines", "Pitcairn", "Poland",
"Portugal", "Puerto Rico", "Qatar", "Reunion", "Romania", "Russia",
"Rwanda", "Ryukyu Is", "Sabah", "Saint Helena", "Saint Kitts-Nevis-Anguilla-Aru",
"Saint Pierre and Miquelon", "Samoa", "San Marino", "Sao Tome and Principe",
"Sarawak", "Saudi Arabia", "Senegal", "Seychelles", "Sierra Leone",
"SIKKIM", "Singapore", "Slovak Republic", "Slovenia", "Solomon Islands",
"Somalia", "South Africa", "Soviet Union", "Spain", "Special Categories",
"Sri Lanka", "St. Kitts and Nevis", "St. Lucia", "St. Vincent and the Grenadines",
"Sudan", "Suriname", "Svalbard and Jan Mayen Is", "Swaziland",
"Sweden", "Switzerland", "Syrian Arab Republic", "Taiwan", "Tajikistan",
"Tanzania", "Thailand", "Togo", "Tokelau", "Tonga", "Trinidad and Tobago",
"Tunisia", "Turkey", "Turkmenistan", "Turks and Caicos Isl.",
"Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
"United States", "Unspecified", "Uruguay", "Us Msc.Pac.I", "Uzbekistan",
"Vanuatu", "Venezuela", "Vietnam", "Virgin Islands (U.S.)", "Wake Island",
"Wallis and Futura Isl.", "Western Sahara", "World", "Yemen Democratic",
"Yemen, Rep.", "Yugoslavia", "Yugoslavia, FR (Serbia/Montene",
"Zambia", "Zimbabwe"), iso3 = c("AFG", "ALB", "DZA", "ASM", "AND",
"AGO", "AIA", "ATG", "ARG", "ARM", "ABW", "AUS", "AUT", "AZE",
"BHS", "BHR", "BGD", "BRB", "BLR", "BEL", "BLX", "BLZ", "BEN",
"BMU", "BTN", "BOL", "BIH", "BWA", "BAT", "BRA", "IOT", "VGB",
"BRN", "BGR", "BFA", "BDI", "KHM", "CMR", "CAN", "CPV", "CYM",
"CAF", "TCD", "CHL", "CHN", "CXR", "CCK", "COL", "COM", "ZAR",
"COG", "COK", "CRI", "CIV", "HRV", "CUB", "CYP", "CZE", "CSK",
"DNK", "DJI", "DMA", "DOM", "TMP", "ECU", "EGY", "SLV", "GNQ",
"ERI", "EST", "ETH", "ETF", "EUN", "FRO", "FLK", "FJI", "FIN",
"PCZ", "ZW1", "TAN", "VDR", "SVR", "ZPM", "ATF", "FRA", "FRE",
"GUF", "PYF", "GAB", "GMB", "GAZ", "GEO", "DDR", "DEU", "GHA",
"GIB", "GRC", "GRL", "GRD", "GLP", "GUM", "GTM", "GIN", "GNB",
"GUY", "HTI", "VAT", "HND", "HKG", "HUN", "ISL", "IND", "IDN",
"IRN", "IRQ", "IRL", "ISR", "ITA", "JAM", "JPN", "JTN", "JOR",
"KAZ", "KEN", "KIR", "PRK", "KOR", "KWT", "KGZ", "LAO", "LVA",
"LBN", "LSO", "LBR", "LBY", "LIE", "LTU", "LUX", "MAC", "MKD",
"MDG", "MWI", "MYS", "MDV", "MLI", "MLT", "MHL", "MTQ", "MRT",
"MUS", "MEX", "FSM", "MID", "MDA", "MCO", "MNG", "MSR", "MAR",
"MOZ", "MMR", "NAM", "NRU", "NPL", "NLD", "ANT", "NZE", "NCL",
"NZL", "NIC", "NER", "NGA", "NIU", "NFK", "MNP", "NOR", "OMN",
"PCE", "PAK", "PLW", "PAN", "PNG", "PRY", "PMY", "PER", "PHL",
"PCN", "POL", "PRT", "PRI", "QAT", "REU", "ROU", "RUS", "RWA",
"RYU", "SBH", "SHN", "KN1", "SPM", "WSM", "SMR", "STP", "SWK",
"SAU", "SEN", "SYC", "SLE", "SIK", "SGP", "SVK", "SVN", "SLB",
"SOM", "ZAF", "SVU", "ESP", "SPE", "LKA", "KNA", "LCA", "VCT",
"SDN", "SUR", "SJM", "SWZ", "SWE", "CHE", "SYR", "TWN", "TJK",
"TZA", "THA", "TGO", "TKL", "TON", "TTO", "TUN", "TUR", "TKM",
"TCA", "TUV", "UGA", "UKR", "ARE", "GBR", "USA", "UNS", "URY",
"USP", "UZB", "VUT", "VEN", "VNM", "VIR", "WAK", "WLF", "ESH",
"WLD", "YDR", "YEM", "SER", "YUG", "ZMB", "ZWE"), isonum = c(4,
8, 12, 16, 20, 24, 660, 28, 32, 51, 533, 36, 40, 31, 44, 48,
50, 52, 112, 56, 58, 84, 204, 60, 64, 68, 70, 72, 80, 76, 86,
92, 96, 100, 854, 108, 116, 120, 124, 132, 136, 140, 148, 152,
156, 162, 166, 170, 174, 180, 178, 184, 188, 384, 191, 192, 196,
203, 200, 208, 262, 212, 214, 626, 218, 818, 222, 226, 232, 233,
231, 230, 918, 234, 238, 242, 246, 592, 717, 835, 868, 866, 836,
260, 250, 838, 254, 258, 266, 270, 274, 268, 278, 276, 288, 292,
300, 304, 308, 312, 316, 320, 324, 624, 328, 332, 336, 340, 344,
348, 352, 356, 360, 364, 368, 372, 376, 380, 388, 392, 396, 400,
398, 404, 296, 408, 410, 414, 417, 418, 428, 422, 426, 430, 434,
438, 440, 442, 446, 807, 450, 454, 458, 462, 466, 470, 584, 474,
478, 480, 484, 583, 488, 498, 492, 496, 500, 504, 508, 104, 516,
520, 524, 528, 530, 536, 540, 554, 558, 562, 566, 570, 574, 580,
578, 512, 582, 586, 585, 591, 598, 600, 459, 604, 608, 612, 616,
620, 630, 634, 638, 642, 643, 646, 647, 461, 654, 658, 666, 882,
674, 678, 457, 682, 686, 690, 694, 698, 702, 703, 705, 90, 706,
710, 810, 724, 839, 144, 659, 662, 670, 736, 740, 744, 748, 752,
756, 760, 158, 762, 834, 764, 768, 772, 776, 780, 788, 792, 795,
796, 798, 800, 804, 784, 826, 840, 898, 858, 849, 860, 548, 862,
704, 850, 872, 876, 732, 0, 720, 887, 891, 890, 894, 716)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -264L), spec = structure(list(
cols = list(country = structure(list(), class = c("collector_character",
"collector")), iso3 = structure(list(), class = c("collector_character",
"collector")), isonum = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
## #####list of eu countries
## eu_list <- c(
## "austria", "italy",
## "belgium", "latvia",
## "bulgaria", "lithuania",
## "croatia", "luxembourg",
## "cyprus", "malta",
## "czech republic", "netherlands",
## "denmark", "poland",
## "estonia", "portugal",
## "finland", "romania",
## "france", "slovakia",
## "germany", "slovenia",
## "greece", "spain",
## "hungary", "sweden",
## "ireland", "united kingdom"
## )
## A function to list all the files with a certain extension in a folder
extract_file_list <- function(file_path, extension, full_path=1){
## my_files <- file_path %>%
## list.files() %>%
## .[str_detect(., extension)]
## if (full_path==1){
## my_files2 <- paste(file_path, my_files, sep="")
## } else {
## my_files2 <- my_files
## }
## See https://kpress.dev/blog/2022-06-19-replacing-the-magrittr-pipe-with-native-r-pipe/
my_files <- file_path |>
list.files() |>
(\(x) x[str_detect(x, extension) ])()
if (full_path==1){
my_files2 <- paste(file_path, my_files, sep="")
} else {
my_files2 <- my_files
}
return(my_files2)
}
#### a function to get a network from the coordinates of a cluster
get_network_from_table <- function(fname,threshold){
data<-read_table2(fname, col_names=F)
dmat<-dist(data,method="euclidean", diag=T, upper=T)
dmat <- as.matrix(dmat)
g <- graph.adjacency(dmat<threshold,mode="undirected",weighted=NULL)
g <- igraph::simplify(g)
return(g)
}
### this function does not need to directly read the data
get_network_from_table2 <- function(data,threshold){
dmat<-dist(data,method="euclidean", diag=T, upper=T)
dmat <- as.matrix(dmat)
g <- graph.adjacency(dmat<threshold,mode="undirected",weighted=NULL)
g <- igraph::simplify(g)
return(g)
}
##a function to get the fragment distribution of a cluster
fragment_cluster <- function(g){
n_links=E(g)
myseq <- seq(n_links)
frag_dist <- c()
for (i in myseq){
g_broken <- delete.edges(g,i)
fragments <- clusters(g_broken)$csize
if(length(fragments)==1){
fragments=c(0, fragments)
}
fragments_tibble <- tibble(n1=fragments[1], n2=fragments[2])
## print(fragments_tibble)
frag_dist <- bind_rows(frag_dist, fragments_tibble)
}
frag_dist <- frag_dist |>
mutate(e1=if_else(n1>=n2, n1, n2),
e2=if_else(n1<=n2, n1, n2)) |> ## %>%
mutate(n1=e1,n2=e2) |> ## %>%
select(-c(e1,e2))
return(frag_dist)
}
## Now I provide a list of bonds to remove one at the time (sample with replacement)
fragment_cluster_sel_bonds <- function(g, myseq){
n_links=E(g)
## myseq <- seq(n_links)
frag_dist <- c()
for (i in myseq){
g_broken <- delete.edges(g,i)
fragments <- clusters(g_broken)$csize
if(length(fragments)==1){
fragments=c(0, fragments)
}
fragments_tibble <- tibble(n1=fragments[1], n2=fragments[2])
## print(fragments_tibble)
frag_dist <- bind_rows(frag_dist, fragments_tibble)
}
frag_dist <- frag_dist %>%
mutate(e1=if_else(n1>=n2, n1, n2),
e2=if_else(n1<=n2, n1, n2)) |> ## %>%
mutate(n1=e1,n2=e2) |> ## %>%
select(-c(e1,e2))
return(frag_dist)
}
#####A generalisation of the previous calculation
fragment_cluster_prob <- function(g, probs, n_rem){
n_links=E(g)
mysamp <- seq(n_links)
myseq <- sample(mysamp, n_rem,replace=T, probs)
frag_dist <- c()
for (i in myseq){
g_broken <- delete.edges(g,i)
fragments <- clusters(g_broken)$csize
if(length(fragments)==1){
fragments=c(0, fragments)
## print("No frag!!!")
}
fragments_tibble <- tibble(n1=fragments[1], n2=fragments[2])
## print(fragments_tibble)
frag_dist <- bind_rows(frag_dist, fragments_tibble)
}
frag_dist <- frag_dist |> ## %>%
mutate(e1=if_else(n1>=n2, n1, n2),
e2=if_else(n1<=n2, n1, n2)) |> ## %>%
mutate(n1=e1,n2=e2) |> ## %>%
select(-c(e1,e2))
return(frag_dist)
}
fragment_cluster_prob2 <- function(g, probs, n_rep){
n_links=E(g)
mysamp <- seq(n_links)
n_rem <- sum(probs>0)*n_rep
myseq <- sample(mysamp, n_rem,replace=T, probs)
frag_dist <- c()
for (i in myseq){
g_broken <- delete.edges(g,i)
fragments <- clusters(g_broken)$csize
if(length(fragments)==1){
fragments=c(0, fragments)
## print("No frag!!!")
}
fragments_tibble <- tibble(n1=fragments[1], n2=fragments[2])
## print(fragments_tibble)
frag_dist <- bind_rows(frag_dist, fragments_tibble)
}
frag_dist <- frag_dist |> ## %>%
mutate(e1=if_else(n1>=n2, n1, n2),
e2=if_else(n1<=n2, n1, n2)) |> ## %>%
mutate(n1=e1,n2=e2) |> ## %>%
select(-c(e1,e2))
return(frag_dist)
}
### a function to remove all the nodes above a certain degree
remove_nodes_high_deg <- function(g3,k){
sel3 <- which(degree(g3)>=k)
g3_broken <- g3 |> ## %>%
delete.vertices(sel3)
clusters_g3_broken <- clusters(g3_broken)$csize |> ## %>%
enframe(name = NULL)
return(clusters_g3_broken)
}
### remove special characters see https://kutt.it/rb6Tn4
## superseeded by remove_special_char
## remove_special_characters <- function(x, pattern=""){
## res <- str_replace_all(x, "[^[:alnum:]]", pattern)
## return(res)
## }
remove_special_character<- function(x, pattern=""){
remove_special_char(x, pattern)
}
###function to normalise all the numeric columns of a tibble
normalize_cols <- function(dat){
res <- dat |> ## %>%
## mutate_if(is.numeric, function(x) x/sum(x))
mutate(across(where(is.numeric), \(x) x/sum(x)))
return(res)
}
## and alternative spelling
normalise_cols <- normalize_cols
###now with a power
normalize_cols_power <- function(dat, power){
res <- dat |> ## %>%
## mutate_if(is.numeric, function(x) x^power/sum(x^power))
mutate(across(where(is.numeric), \(x) x^power/sum(x^power)))
return(res)
}
normalise_cols_power <- normalize_cols_power
#' Reorder an x or y axis within facets
#'
#' Reorder a column before plotting with faceting, such that the values are ordered
#' within each facet. This requires two functions: \code{reorder_within} applied to
#' the column, then either \code{scale_x_reordered} or \code{scale_y_reordered} added
#' to the plot.
#' This is implemented as a bit of a hack: it appends ___ and then the facet
#' at the end of each string.
#'
#' @param x Vector to reorder.
#' @param by Vector of the same length, to use for reordering.
#' @param within Vector of the same length that will later be used for faceting
#' @param fun Function to perform within each subset to determine the resulting
#' ordering. By default, mean.
#' @param sep Separator to distinguish the two. You may want to set this manually
#' if ___ can exist within one of your labels.
#' @param ... In \code{reorder_within} arguments passed on to \code{\link{reorder}}.
#' In the scale functions, extra arguments passed on to
#' \code{\link[ggplot2]{scale_x_discrete}} or \code{\link[ggplot2]{scale_y_discrete}}.
#'
#' @source "Ordering categories within ggplot2 Facets" by Tyler Rinker:
#' \url{https://trinkerrstuff.wordpress.com/2016/12/23/ordering-categories-within-ggplot2-facets/}
#'
#' @examples
#'
#' library(tidyr)
#' library(ggplot2)
#'
#' iris_gathered <- gather(iris, metric, value, -Species)
#'
#' # reordering doesn't work within each facet (see Sepal.Width):
#' ggplot(iris_gathered, aes(reorder(Species, value), value)) +
#' geom_boxplot() +
#' facet_wrap(~ metric)
#'
#' # reorder_within and scale_x_reordered work.
#' # (Note that you need to set scales = "free_x" in the facet)
#' ggplot(iris_gathered, aes(reorder_within(Species, value, metric), value)) +
#' geom_boxplot() +
#' scale_x_reordered() +
#' facet_wrap(~ metric, scales = "free_x")
#'
#' @export
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
#' @rdname reorder_within
#' @export
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
#' @rdname reorder_within
#' @export
scale_y_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
## convert_to_utf <- function(x){
## res <- ## iconv(x, "latin1", "UTF-8",sub='') %>%
## stri_trans_general(x, "latin-ascii")
## }
## convert_to_utf <- function(x){
## Encoding(x) <- "latin1"
## res <- stri_trans_general(x, "latin-ascii")
## }
convert_to_utf <- function(x){
res <- stringi::stri_encode(x, "UTF-8")
}
## a function to find the exact words in a vector string
find_exact_matches <- function(keywords, text_string ){
kk1 <- paste("\\b",keywords, sep="")
kk2 <- paste(kk1, "\\b|", sep="")
kk <- paste(kk2, collapse="") |> ## %>%
drop_last_char(1)
res <- str_detect(text_string,kk)
}
replace_exact_matches <- function(keywords, text_string, replacement ){
kk1 <- paste("\\b",keywords, sep="")
kk2 <- paste(kk1, "\\b|", sep="")
kk <- paste(kk2, collapse="") |> ## %>%
drop_last_char(1)
res <- str_replace_all(text_string,kk, replacement)
}
find_matches <- function(kk2, text_string ){
kk <- paste(kk2, collapse="|")
## print("kk is")
## print(kk)
res <- str_detect(text_string, kk)
}
##functions to append data to an excel file and to add a sheet to an excel file.
append_xlsx <- function(df_for_workbook, out_xlsx, name_worksheet)
if (!file.exists(out_xlsx)) {
# Create workbook using openxlsx
wb <- createWorkbook()
# Add worksheet
addWorksheet(wb, name_worksheet)
# Write data frame to new worksheet
writeData(wb, name_worksheet, df_for_workbook)
# Save file
saveWorkbook(wb, file = out_xlsx)
} else {
# Read in existing data
old_wb <-
readWorkbook(out_xlsx,
sheet = name_worksheet,
detectDates = TRUE)
# Append new data
new_data <-
bind_rows(old_wb,
df_for_workbook)
# Load and write updated data frame to existing worksheet
wb <- loadWorkbook(out_xlsx)
writeData(wb, name_worksheet, new_data)
# Save file
saveWorkbook(wb, out_xlsx, overwrite = TRUE)
}
add_sheet_xlsx <- function(df, filename, sheet_name){
wb <- loadWorkbook(filename)
addWorksheet(wb, sheet_name)
writeData(wb, sheet = sheet_name, df,keepNA = TRUE)
saveWorkbook(wb,filename,overwrite = T)
}
## function to wrap long text labels in ggplot2.
## See https://stackoverflow.com/questions/21878974/auto-wrapping-of-labels-via-labeller-label-wrap-in-ggplot2
wrap_label <-function(width) {
function(x){
str_wrap(x, width)}
}
## See https://stackoverflow.com/questions/38291794/extract-string-before
select_left_pattern <- function(x, mypattern){
newpattern <- paste("\\", mypattern, sep="")
res <- word(x,1,sep = newpattern)
return(res)
}
select_right_pattern <- function(x, mypattern){
newpattern <- paste("\\", mypattern, sep="")
res <- word(x,2,-1,sep = newpattern)
return(res)
}
### A function to rescale a vector between 0 and 1
## See https://stackoverflow.com/questions/5665599/range-standardization-0-to-1-in-r
scale01 <- function(x, ...){(x - min(x, ...)) / (max(x, ...) - min(x, ...))}
scale_ab <- function(x,a,b, ...){
mm <- min(a,b)
interval <- abs(b-a)
res <- scale01(x, ...)*interval+mm
}
## remove_diacritics <- function(x){
## res <- iconv(x,from = 'UTF-8', to="ASCII//TRANSLIT")
## res <- str_trim(res, side ="both")
## }
remove_diacritics <- function(df){
res <- stri_trans_general(df, "latin-ascii")
return(res)
}
latin_to_ascii <- function(df){
res <- remove_diacritics(df)
return(res)
}
remove_diacritics_convert_utf <- function(x){
## res <- remove_diacritics(x)
## res <- convert_to_utf(res)
res <- x |> ## %>%
remove_diacritics() |> ## %>%
convert_to_utf()
return(res)
}
greek_to_latin <- function(x){
res <- stri_trans_general(x, "cyrillic-latin")
return(res)
}
cyrillic_to_latin <- function(x){
res <- stri_trans_general(x, "greek-latin")
return(res)
}
## ###an auxiliary function to remove short words from a vector of strings
## remove_short_words_aux <- function(x, n){
## mypattern <- paste("\\w{", n, ",}", sep="")
## x2 <- paste(str_extract_all(x, mypattern)[[1]], collapse=' ')
## }
## ## and here is the main function
## remove_short_words <- function(x,n){
## res<-map(x, function(x) remove_short_words_aux(x,n)) %>%
## unlist ## %>%
## ## tibble::enframe(name = NULL)
## }
remove_short_words <- function(x, n){
mypattern <- paste0("\\w{", n, ",}")
sapply(str_extract_all(x, mypattern), paste, collapse=' ')
}
## A replacement for the two functions above
## See https://stackoverflow.com/questions/60626029/map2-df-and-named-arguments/67666104#67666104
###and this is a function to only find and keep, as a single vector, all the unique words above a certain length.
find_long_words_unique <- function(x,n, pattern){
## require(magrittr)
res <- str_split(x,pattern) |> ## %>%
unlist() |> ## %>%
enframe(name = NULL) |> ## %>%
filter(nchar(value)>n) |> ## %>%
distinct() |> ## %>%
pull(value )
}
#######
remove_trailing_spaces <- function(string){
str_replace(gsub("\\s+", " ", str_trim(string)), "B", "b")
}
sort_all <- function(df){
res <- map_df(df, sort, na.last = T)
return(res)
}
## see https://stackoverflow.com/questions/18509527/first-letter-to-upper-case
first_up <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
read_utf16 <- function(filename){
res <- read.delim(filename, fileEncoding="UTF-16" ) |> ## %>%
as_tibble()
return(res)
}
## convert_to_latin <- function(df){
## res <- stri_trans_general(df, "latin-greek")
## return(res)
## }
####################################################
####################################################
####################################################
####################################################
##a function to move a row in a data frame.
## move_row <- function(df, ini_pos, fin_pos){
## row_pick <- slice(df, ini_pos)
## if (fin_pos=="last"){
## res <- df %>%
## slice(-ini_pos) %>%
## add_row(row_pick, .before = nrow(.))
## } else{
## res <- df %>%
## slice(-ini_pos) %>%
## add_row(row_pick, .before = fin_pos)
## }
## return(res)
## }
## Function rewritten with the native pipe
## See https://stackoverflow.com/a/72801062/2952838
move_row <- function(df, ini_pos, fin_pos){
row_pick <- slice(df, ini_pos)
if (fin_pos=="last"){
res <- df |>
slice(-ini_pos) |>
(\(x) add_row(x, row_pick, .before = nrow(x)+1))()
} else{
res <- df |>
slice(-ini_pos) |>
add_row(row_pick, .before = fin_pos)
}
return(res)
}
move_elem <- function(x, ini_pos, fin_pos){
df <- as_tibble(x)
res <- move_row(df, ini_pos, fin_pos) |> ## %>%
pull(value)
}
#### a function to hash all the rows in a data frame.
## See https://stackoverflow.com/questions/62184424/hashing-every-row-of-a-tibble
## I made a function relying on rlang to get the job done.
## hash_all_rows <- function(df, nn=hash){
## res <- df %>%
## mutate({{nn}} := pmap_chr(., ~ digest(c(...))))
## return(res)
## }
## an improvement with across()
hash_all_rows <- function(df, nn=hash){
if (class(df)[1]=="rowwise_df"){
res <- df %>%
mutate({{nn}} := digest(across(everything()))) |> ## %>%
ungroup()
} else {
res <- df %>% rowwise() |> ## %>%
mutate({{nn}} := digest(across(everything()))) |> ## %>%
ungroup()
}
return(res)
}
hash_all_rows_simple <- function(df, nn=hash){
res <- df |> ## %>%
unite({{nn}}, everything(), remove = FALSE)
return(res)
}
make_italics <- function(x){
res <- paste("\\textit{", x, "}", sep="" )
return(res)
}
make_bold <- function(x){
res <- paste("\\textbf{", x, "}", sep="" )
return(res)
}
## see https://stackoverflow.com/questions/39975317/how-to-reverse-the-order-of-a-dataframe-in-r
reverse_table <- function(df){
res <- df |> ## %>%
map_df(rev)
return(res)
}
## I slightly modify a function from janitor to do add the totals.
## add_total <- function(x, pos=1, ...){
## adorn_totals(x, ...) %>%
## as_tibble() %>%
## move_row(nrow(.)+1, pos)
## }
## I rewrite the function above using the native pipe.
## add_total <- function(x, pos=1, ...){
## adorn_totals(x, ...) |>
## as_tibble() |>
## (\(x) move_row(x, nrow(x)+1, pos))()
## }
add_total <- function(x, pos=1, ...){
x |> as_tibble() |>
adorn_totals( ...) |>
as_tibble() |>
(\(x) move_row(x, nrow(x)+1, pos))()
}
## See split(d, ceiling(seq_along(d)/20))
split_vec <- function(d, mylen){
res <- split(d, ceiling(seq_along(d)/mylen))
return(res)
}
#######################################################################
#######################################################################
#######################################################################
#######################################################################
#######################################################################
#######################################################################
### See https://github.com/inbo/inborutils/blob/master/R/csv_to_sqlite.R .
#' Save a delimited text table into a single table sqlite database
#'
#' The table can be a comma separated (csv) or a tab separated (tsv) or any
#' other delimited text file. The file is read in chunks. Each chunk is copied
#' in the same sqlite table database before the next chunk is loaded into
#' memory. See the INBO tutorial \href{https://github.com/inbo/tutorials/blob/master/source/data-handling/large-files-R.Rmd}{Handling large files in R}
#' to learn more about.
#'
#' @section Remark:
#' The \code{callback} argument in the \code{read_delim_chunked} function call
#' refers to the custom written callback function `append_to_sqlite` applied
#' to each chunk.
#'
#' @param csv_file Name of the text file to convert.
#' @param sqlite_file Name of the newly created sqlite file.
#' @param table_name Name of the table to store the data table in the sqlite
#' database.
#' @param delim Text file delimiter (default ",").
#' @param pre_process_size Number of lines to check the data types of the
#' individual columns (default 1000).
#' @param chunk_size Number of lines to read for each chunk (default 50000).
#' @param show_progress_bar Show progress bar (default TRUE).
#' @param ... Further arguments to be passed to \code{read_delim}.
#'
#' @return a SQLite database
#'
#' @examples
#' \dontrun{
#' library(R.utils)
#' library(dplyr)
#' csv.name <- "2016-04-20-processed-logs-big-file-example.csv"
#' db.name <- "2016-04-20-processed-logs-big-file-example.db"
#' # download the CSV file example
#' csv.url <- paste("https://s3-eu-west-1.amazonaws.com/lw-birdtracking-data/",
#' csv.name, ".gz", sep = "")
#' download.file(csv.url, destfile = paste0(csv.name, ".gz"))
#' gunzip(paste0(csv.name, ".gz"))
#' # Make a SQLite database
#' sqlite_file <- "example2.sqlite"
#' table_name <- "birdtracks"
#' csv_to_sqlite(csv_file = csv.name,
#' sqlite_file = sqlite_file,
#' table_name =table_name)
#' # Get access to SQLite database
#' my_db <- src_sqlite(sqlite_file, create = FALSE)
#' bird_tracking <- tbl(my_db, "birdtracks")
#' # Example query via dplyr
#' results <- bird_tracking %>%
#' filter(device_info_serial == 860) %>%
#' select(date_time, latitude, longitude, altitude) %>%
#' filter(date_time < "2014-07-01") %>%
#' filter(date_time > "2014-03-01") %>%
#' as_tibble()
#' head(results)
#' }
#' @export
#' @importFrom DBI dbConnect dbDisconnect
#' @importFrom RSQLite SQLite dbWriteTable
#' @importFrom readr read_delim read_delim_chunked
#' @importFrom dplyr %>% select_if mutate_at
#' @importFrom lubridate is.Date is.POSIXt
## csv_to_sqlite <- function(csv_file, sqlite_file, table_name,
## delim = ",",
## pre_process_size = 1000, chunk_size = 50000,
## show_progress_bar = TRUE, ...) {
## con <- dbConnect(SQLite(), dbname = sqlite_file)
## # read a first chunk of data to extract the colnames and types
## # to figure out the date and the datetime columns
## df <- read_delim(csv_file, delim = delim, n_max = pre_process_size, ...)
## date_cols <- df %>%
## select_if(is.Date) %>%
## colnames()
## datetime_cols <- df %>%
## select_if(is.POSIXt) %>%
## colnames()
## # write the first batch of lines to SQLITE table, converting dates to string
## # representation
## df <- df %>%
## mutate_at(.vars = date_cols, .funs = as.character.Date) %>%
## mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt)
## dbWriteTable(con, table_name, df, overwrite = TRUE)
## # readr chunk functionality
## read_delim_chunked(
## csv_file,
## callback = append_to_sqlite(con = con, table_name = table_name,
## date_cols = date_cols,
## datetime_cols = datetime_cols),
## delim = delim,
## skip = pre_process_size, chunk_size = chunk_size,
## progress = show_progress_bar,
## col_names = colnames(df), ...)
## dbDisconnect(con)
## }
#' Callback function that appends new sections to the SQLite table.
#' @param con A valid connection to SQLite database.
#' @param table_name Name of the table to store the data table in the sqlite
#' database.
#' @param date_cols Name of columns containing Date objects
#' @param datetime_cols Name of columns containint POSIXt objects.
#'
#' @keywords internal
## append_to_sqlite <- function(con, table_name,
## date_cols, datetime_cols) {
## #' @param x Data.frame we are reading from.
## function(x, pos) {
## x <- as.data.frame(x)
## x <- x %>%
## mutate_at(.vars = date_cols, .funs = as.character.Date) %>%
## mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt)
## # append data frame to table
## dbWriteTable(con, table_name, x, append = TRUE)
## }
## }
### A more modern version of the functions above
csv_to_sqlite <- function(csv_file, sqlite_file, table_name,
delim = ",",
pre_process_size = 1000, chunk_size = 50000,
show_progress_bar = TRUE, ...) {
con <- dbConnect(SQLite(), dbname = sqlite_file)
# read a first chunk of data to extract the colnames and types
# to figure out the date and the datetime columns
df <- read_delim(csv_file, delim = delim, n_max = pre_process_size, ...)
date_cols <- df |>
select(where(is.Date)) |>
colnames()
datetime_cols <- df |>
select(where(is.POSIXt)) |>
colnames()
# write the first batch of lines to SQLITE table, converting dates to string
# representation
## df <- df %>%
## mutate_at(.vars = date_cols, .funs = as.character.Date) %>%
## mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt)
df <- df |>
mutate(across( all_of(date_cols), \(x) as.character.Date(x))) |>
mutate(across(all_of(datetime_cols), \(x) as.character.POSIXt(x)))
dbWriteTable(con, table_name, df, overwrite = TRUE)
# readr chunk functionality
read_delim_chunked(
csv_file,
callback = append_to_sqlite(con = con, table_name = table_name,
date_cols = date_cols,
datetime_cols = datetime_cols),
delim = delim,
skip = pre_process_size, chunk_size = chunk_size,
progress = show_progress_bar,
col_names = colnames(df), ...)
dbDisconnect(con)
}
append_to_sqlite <- function(con, table_name,
date_cols, datetime_cols) {
#' @param x Data.frame we are reading from.
function(x, pos) {
x <- as.data.frame(x)
x <- x |>
mutate(across( all_of(date_cols), \(x) as.character.Date(x))) |>
mutate(across( all_of(datetime_cols), \(x) as.character.POSIXt(x)))
# append data frame to table
dbWriteTable(con, table_name, x, append = TRUE)
}
}
#######################################################################
#######################################################################
#######################################################################
#######################################################################
## see https://stackoverflow.com/questions/43627679/round-any-equivalent-for-dplyr
round_any = function(x, accuracy, f=round){f(x/ accuracy) * accuracy}
### a function to determine the bond positions in an aggregate
##The old function used a combination of mutate_all and across which has been
## superseeded.
## bond_positions <- function(g, agg){
## edge_list <- get.edgelist(g) %>%
## as_tibble(.name_repair = "unique") %>%
## set_names("vertex1", "vertex2") %>%
## ## mutate_all(as.integer)
## mutate_all(across(everything()),as.integer)
## pos1 <- agg[edge_list$vertex1,]
## pos2 <- agg[edge_list$vertex2,]
## pos_bond <- ((pos1+pos2)/2) %>% as_tibble
## return(pos_bond)
## }
bond_positions <- function(g, agg){
edge_list <- get.edgelist(g) |> ## %>%
as_tibble(.name_repair = "unique") |> ## %>%
set_names("vertex1", "vertex2") |> ## %>%
## mutate_all(as.integer)
mutate(across(everything(),\(x) as.integer(x)))
pos1 <- agg[edge_list$vertex1,]
pos2 <- agg[edge_list$vertex2,]
pos_bond <- ((pos1+pos2)/2) |> ## %>%
as_tibble()
return(pos_bond)
}
### a function to find all the the distances of a set of points (e.g. bonds
## positions) to another point (e.g. the center of mass) within a threshold
find_positions_near <- function(pos_bond, cm, threshold){
require(Rfast)
dist_bonds_from_cm <- dista(as.matrix(pos_bond), as.matrix(cm))
norm_dist_bonds_from_cm <- dist_bonds_from_cm |> ## %>%
scale01()
## dist_bonds_from_cm <- dista(as.matrix(pos_bond), as.matrix(cm))
res <- which(norm_dist_bonds_from_cm<=threshold)
return(res)
}
find_positions_near2 <- function(pos_bond, cm, threshold){
require(Rfast)
dist_bonds_from_cm <- dista(as.matrix(pos_bond), as.matrix(cm))
res <- select_chunk(dist_bonds_from_cm, 0, threshold)
return(res)
}
#### function to fragment a cluster according to the edge betweenness
fragment_cluster_prob_new <- function(g, n_rep){
n_links=E(g)
mysamp <- seq(n_links)
probs <- edge_betweenness(g, directed=F)
n_rem <- ecount(g)*n_rep
myseq <- sample(mysamp, n_rem,replace=T, probs)
## print("myseq is, ")
## print(myseq)
frag_dist <- c()
for (i in myseq){
g_broken <- delete.edges(g,i)
fragments <- clusters(g_broken)$csize
if(length(fragments)==1){
fragments=c(0, fragments)
## print("No frag!!!")
}
fragments_tibble <- tibble(n1=fragments[1], n2=fragments[2])
## print(fragments_tibble)
frag_dist <- bind_rows(frag_dist, fragments_tibble)
}
frag_dist <- frag_dist |> ## %>%
mutate(e1=if_else(n1>=n2, n1, n2),
e2=if_else(n1<=n2, n1, n2)) |> ## %>%
mutate(n1=e1,n2=e2) |> ## %>%
select(-c(e1,e2))
return(frag_dist)
}
##simple function to cap outliers. See https://stackoverflow.com/questions/13339685/how-to-replace-outliers-with-the-5th-and-95th-percentile-values-in-r
capOutlier <- function(x){
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]
return(x)
}
detectOutlier <- function(x){
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
is_outlier <- rep("no", length(x))
is_outlier[x < (qnt[1] - H)] <- "low"
is_outlier[x > (qnt[2] + H)] <- "high"
return(is_outlier)
}
time_diff_years <- function(dob, today){
res <- interval(dob, today) / years(1)
return(res)
}
##############################################
## a function to specify the number of cores in furrr
return_cores <- function(x){
return(x)
}
## save_excel <- function(df, filename, sheet_name="data", ...){
## write.xlsx(df, filename,
## sheetName = sheet_name, row.names=F,keepNA = TRUE, ...)
## }
## A better version of the function above
## See https://github.com/awalker89/openxlsx/issues/157
save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
tryCatch({
wb <- loadWorkbook(fileName)
addWorksheet(wb = wb, sheet = sheetName)
writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
keepNA=na_yes ,...)
saveWorkbook(wb = wb, file = fileName, overwrite = T)
},
error = function(err){
wb <- createWorkbook(fileName)
addWorksheet(wb = wb, sheet = sheetName)
writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... )
saveWorkbook(wb = wb, file = fileName, overwrite = T)
})
}
## save_excel <- function(output, fileName, sheetName="data",...){
## tryCatch({
## wb <- loadWorkbook(fileName)
## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
## keepNA=T ,...)
## saveWorkbook(wb = wb, file = fileName, overwrite = T)
## },
## error = function(err){
## wb <- createWorkbook(fileName)
## addWorksheet(wb = wb, sheetName = sheetName)
## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=T , ... )
## saveWorkbook(wb = wb, file = fileName, overwrite = T)
## })
## }
clean_data <- function(x, remove_duplicated_rows=T){
res <- x |> ## %>%
clean_names() |> ## %>%
remove_empty() |> ## %>%
rem_dupl_cols() |> ## %>%
rem_const_cols()
if (remove_duplicated_rows==T){
res <- res |>
distinct()
}
return(res)
}
clean_data_keep_const <- function(x){
res <- x |> ## %>%
clean_names() |> ## %>%
remove_empty() |> ## %>%
distinct() |> ## %>%
rem_dupl_cols() ## %>%
## rem_const_cols()
return(res)
}
## See https://stackoverflow.com/questions/22058393/convert-a-numeric-month-to-a-month-abbreviation . Convert integers to months
to_month <- function(m){
require(lubridate)
res <- as.character(month(ymd(010101) + months(m-1),label=TRUE,abbr=TRUE))
return(res)
}
get_dupes_short <- function(df, ...){
require(janitor)
res <- df |> ## %>%
get_dupes(...) |> ## %>%
distinct()
return(res)
}
join_replace <- function(val_ini, x, val_new, x2 ){
## require(magrittr)
df1 <- tibble(value=val_ini, x_pos=x)
df2 <- tibble(value_new=val_new, x_pos=x2)
df_out <- df1 |> ## %>%
left_join(y=df2) |> ## %>%
mutate(value_new=if_else(is.na(value_new), value, value_new)) |> ## %>%
pull(value_new)
}
bind_df_list <- function(ll){
res <- map_df(ll, bind_rows)
return(res)
}
## See https://stackoverflow.com/questions/66534701/r-bind-rows-fails-because-of-unnamed-argument-1
bind_vec_list <- function(ll){
res <- map_dfr(ll, as.data.frame.list)
return(res)
}
bind_df_list_future <- function(ll){
res <- future_map_dfc(ll, bind_rows)
return(res)
}
#a function to take the diff of a column vector and obtain a vector of the same length
diff_col <- function(x){
res <- c(NA, diff(x))
return(res)
}
### Now using the first element of x to start with
diff_col2 <- function(x){
res <- c(x[1], diff(x))
return(res)
}
## function to remove all the spaces from a string
remove_all_spaces <- function(x){
res <- str_replace_all(x, " ", "")
res
}
## see https://www.rdocumentation.org/packages/zoo/versions/1.8-8/topics/na.approx
interpolate_vec <- function(x){
require(forecast)
res <- na.interp(x) |> ## %>%
as.vector()
return(res)
}
## functions to find even and odd numbers
is_odd <- function(x){
res <- x %% 2 == 1
return(res)
}
is_even <- function(x){
res <- x %% 2 == 0
return(res)
}
find_odd <- function(x){
res <- which(is_odd(x))
return(res)
}
find_even <- function(x){
res <- which(is_even(x))
return(res)
}
## a function to set to zero part of a distribution
## reset_quantiles <- function(x,qq, top_bottom){
## ## top_bottom <- "top"
## if (top_bottom=="top"){
## cut_off <- quantile(x, 1-qq)
## x[x<cut_off] <- 0
## } else if (top_bottom=="bottom"){
## cut_off <- quantile(x, qq)
## x[x>cut_off] <- 0
## } else{
## print("error!!!")
## }
## return(x)
## }
reset_quantiles <- function(x, low, high){
## top_bottom <- "top"
cut_low <- quantile(x, )
if (top_bottom=="top"){
cut_off <- quantile(x, 1-qq)
x[x<cut_off] <- 0
} else if (top_bottom=="bottom"){
cut_off <- quantile(x, qq)
x[x>cut_off] <- 0
} else{
print("error!!!")
}
return(x)
}
## See https://stackoverflow.com/questions/28709331/logarithmic-grid-for-plot-with-ggplot2
## scientific_10 <- function(x) {
## xout <- gsub("1e", "10^{", format(x),fixed=TRUE)
## xout <- gsub("{-0", "{-", xout,fixed=TRUE)
## xout <- gsub("{+", "{", xout,fixed=TRUE)
## xout <- gsub("{0", "{", xout,fixed=TRUE)
## xout <- paste(xout,"}",sep="")
## return(parse(text=xout))
## }
## This one works! See https://stackoverflow.com/questions/10762287/how-can-i-format-axis-labels-with-exponents-with-ggplot2-and-scales/45867076#45867076
scientific_10 = function(x) {
ifelse(
x==0, "0",
parse(text = sub("e[+]?", " %*% 10^", scientific_format()(x)))
)
}
## scale_x_log10nice <- function(name=NULL,omag=seq(-10,20),...) {
## breaks10 <- 10^omag
## scale_x_log10(name,breaks=breaks10,labels=scientific_10(breaks10),...)
## }
## scale_y_log10nice <- function(name=NULL,omag=seq(-10,20),...) {
## breaks10 <- 10^omag
## scale_y_log10(name,breaks=breaks10,labels=scientific_10(breaks10),...)
## }
scale_x_log10nice <- function(name=NULL,omag=seq(-20,20),...) {
breaks10 <- 10^omag
scale_x_log10(name,breaks=breaks10,
labels=scales::trans_format("log10", scales::math_format(10^.x)),...)
}
scale_y_log10nice <- function(name=NULL,omag=seq(-20,20),...) {
breaks10 <- 10^omag
scale_y_log10(name,breaks=breaks10,
labels=scales::trans_format("log10", scales::math_format(10^.x)),...)
}
scale_loglog <- function(...) {
list(scale_x_log10nice(...),scale_y_log10nice(...))
}
## See also https://waterdata.usgs.gov/blog/boxplots/
prettyLogs <- function(x){
pretty_range <- range(x[x > 0])
pretty_logs <- 10^(-10:10)
log_index <- which(pretty_logs < pretty_range[2] &
pretty_logs > pretty_range[1])
log_index <- c(log_index[1]-1,log_index, log_index[length(log_index)]+1)
pretty_logs_new <- pretty_logs[log_index]
return(pretty_logs_new)
}
fancyNumbers <- function(n){
nNoNA <- n[!is.na(n)]
x <-gsub(pattern = "1e",replacement = "10^",
x = format(nNoNA, scientific = TRUE))
exponents <- as.numeric(sapply(strsplit(x, "\\^"), function(j) j[2]))
base <- ifelse(exponents == 0, "1", ifelse(exponents == 1, "10","10^"))
exponents[base == "1" | base == "10"] <- ""
textNums <- rep(NA, length(n))
textNums[!is.na(n)] <- paste0(base,exponents)
textReturn <- parse(text=textNums)
return(textReturn)
}
##################################################################
##################################################################
##################################################################
##################################################################
select_chunk <- function(x, low, high){
if (low<0 | high<0 | low>1 | high>1 | low >=high){
print("error in the definition of high and low")
} else {
ss <- sort(x, index.return=T, decreasing=F)
n <- length(x)
low_cut <- round(n*low, 0)
high_cut <- round(n*high, 0)
res <- ss$ix[low_cut:high_cut]
}
}
select_chunk_reset <- function(x, low, high){
chunk <- select_chunk(x, low, high)
## print("chunk is, ")
## print(chunk)
x[-chunk] <- 0
return(x)
}
extract_digits <- function(x, ini, end ){
## options(scipen=999)
x <- format(x, scientific = F, trim=T)
res <- as.numeric(substr(x, ini, end))
## options(scipen=0)
return(res)
}
attribute_month <- function(x){
mm <- tibble(month_num=seq(12), month_name=month.abb)
res <- mm |> ## %>%
filter(month_num %in% x) |> ## %>%
pull(month_name)
return(res)
}
## see https://www.r-bloggers.com/2018/11/benfords-law-for-fraud-detection-with-an-application-to-all-brazilian-presidential-elections-from-2002-to-2018/
return_benford1 <- function(){
benford1 <- log10(1+1/1:9) ## benford first digit
return(benford1)
}
return_benford12 <- function(){
benford12 = log10(1+1/10:99) ## benford 2 digits
return(benford12)
}
return_benford2 <- function(){
benford1 <- log10(1+1/1:9) ## benford first digit
benford12 = log10(1+1/10:99) ## benford 2 digits
benford2 <- tibble(v1=benford12, v2=(10:99 - floor(10:99/10)*10)) |> ## %>%
group_by(v2) |> ## %>%
summarise(v1=sum(v1)) |> ## %>%
pull(v1) ## benford second digit
return(benford2)
}
##calculation of the Herfindahl–Hirschman Index
hhi_index <- function(x){
y <- x/sum(x)
res <- sum(y^2)
return(res)
}
hhi_index_norm <- function(x){
n <- length(x)
y <- x/sum(x)
res <- sqrt(sum(y^2))-sqrt(1/n)
res <- res/(1-sqrt(1/n))
return(res)
}
## See https://stackoverflow.com/questions/63181058/tidy-functional-way-of-interweaving-rows-of-a-data-frame
interleave_rows <- function(df_a, df_b) {
if(nrow(df_b) > nrow(df_a)) return(interleave_rows_tidy(df_b, df_a))
a <- df_a %>% nrow %>% seq
b <- df_b %>% nrow %>% seq
bind_rows(df_a, df_b) %>% arrange(c(a, length(a) * b/(length(b) + 1)))
}
## See https://stackoverflow.com/questions/39778093/how-to-increase-smoothness-of-spheres3d-in-rgl
sphere1.f <- function(x0 = 0, y0 = 0, z0 = 0, r = 1, n = 101, ...){
f <- function(s,t){
cbind( r * cos(t)*cos(s) + x0,
r * sin(s) + y0,
r * sin(t)*cos(s) + z0)
}
persp3d(f, slim = c(-pi/2,pi/2), tlim = c(0, 2*pi), n = n, add = T, ...)
}
## see https://stackoverflow.com/questions/1358003/tricks-to-manage-the-available-memory-in-an-r-session
# improved list of objects
.ls.objects <- function (pos = 1, pattern, order.by,
decreasing=FALSE, head=FALSE, n=5) {
napply <- function(names, fn) sapply(names, function(x)
fn(get(x, pos = pos)))
names <- ls(pos = pos, pattern = pattern)
obj.class <- napply(names, function(x) as.character(class(x))[1])
obj.mode <- napply(names, mode)
obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class)
obj.prettysize <- napply(names, function(x) {
format(utils::object.size(x), units = "auto") })
obj.size <- napply(names, object.size)
obj.dim <- t(napply(names, function(x)
as.numeric(dim(x))[1:2]))
vec <- is.na(obj.dim)[, 1] & (obj.type != "function")
obj.dim[vec, 1] <- napply(names, length)[vec]
out <- data.frame(obj.type, obj.size, obj.prettysize, obj.dim)
names(out) <- c("Type", "Size", "PrettySize", "Length/Rows", "Columns")
if (!missing(order.by))
out <- out[order(out[[order.by]], decreasing=decreasing), ]
if (head)
out <- head(out, n)
out
}
# shorthand
objects_memory <- function(..., n=10) {
.ls.objects(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
}
#######################################################################
## see https://stackoverflow.com/questions/13594223/number-values-include-comma-how-do-i-make-these-numeric
comma_sep_to_numeric <- function(x){
res <- as.numeric(gsub(",", "", as.character(x)))
return(res)
}
convert_to_utf_all <- function(x){
## res <- x %>% mutate(across(where(is.character), ~convert_to_utf(.x)))
res <- x |>
mutate(across(where(is.character), \(x) convert_to_utf(x)))
return(res)
}
convert_to_ascii_all <- function(x){
## res <- x %>% mutate(across(where(is.character), ~latin_to_ascii(.x)))
res <- x |>
mutate(across(where(is.character), \(x) latin_to_ascii(x)))
return(res)
}
###I added a function to perform a set of standard operations when pivoting and
### adding totals along the years. It also relies on the curly-curly syntax.
## make_wide_with_total <- function(x, var1_name, var2_quantity, var_out="All years",
## total_name="Total",pos="bottom", ...){
## if (pos=="bottom"){
## res <- x %>%
## pivot_wider(names_from=all_of(var1_name),
## values_from=all_of(var2_quantity),
## names_sort = T ) %>%
## na_to_pattern(0) %>%
## add_total(nrow(.)+1, name = total_name, ...) %>%
## rowwise() %>%
## mutate({{var_out}}:=sum(c_across(where(is.numeric)))) %>%
## ungroup()
## } else if (pos=="top"){
## res <- x %>%
## pivot_wider(names_from=all_of(var1_name),
## values_from=all_of(var2_quantity),
## names_sort = T ) %>%
## na_to_pattern(0) %>%
## add_total(1, name = total_name, ...) %>%
## rowwise() %>%
## mutate({{var_out}}:=sum(c_across(where(is.numeric)))) %>%
## ungroup()
## }
## return(res)
## }
## I rewrite the function above using the native pipe.
make_wide_with_total <- function(x, var1_name, var2_quantity, var_out="All years",
total_name="Total",pos="bottom", ...){
if (pos=="bottom"){
res <- x |>
pivot_wider(names_from=all_of(var1_name),
values_from=all_of(var2_quantity),
names_sort = T ) |>
na_to_pattern(0) |>
## add_total(nrow(.)+1, name = total_name, ...) |>
(\(x) add_total(x, nrow(x)+1, name=total_name, ...))() |>
rowwise() |>
mutate({{var_out}}:=sum(c_across(where(is.numeric)))) |>
ungroup()
} else if (pos=="top"){
res <- x |>
pivot_wider(names_from=all_of(var1_name),
values_from=all_of(var2_quantity),
names_sort = T ) |>
na_to_pattern(0) |>
add_total(1, name = total_name, ...) |>
rowwise() |>
mutate({{var_out}}:=sum(c_across(where(is.numeric)))) |>
ungroup()
}
return(res)
}
## a function to calculate the total on grouped data
## See https://stackoverflow.com/questions/61178470/dplyr-group-modify-and-anonymous-functions/61178540#61178540
add_total_by_group <- function(df_grouped, ...){
## res <- df_grouped %>%
## group_modify(~ .x %>%
## adorn_totals("row"))
res <- df_grouped |>
group_modify(\(x,y) adorn_totals(x,"row"))
return(res)
}
add_total_by_group_old <- function(df_grouped, ...){
res <- df_grouped %>%
group_modify(~ .x %>%
adorn_totals("row"))
return(res)
}
library(dplyr)
iso_map_eu28 <- tibble(iso3=c("AUT", "BEL", "BGR", "CYP", "CZE", "DEU", "DNK", "ESP", "EST", "FIN",
"FRA",
"GRC", "HRV", "HUN", "IRL", "ITA", "LTU", "LUX", "LVA", "MLT",
"NLD", "POL", "PRT",
"ROM", "SVK", "SVN", "SWE", "GBR"),
iso2 = c("AT", "BE", "BG", "CY", "CZ","DE",
"DK", "ES" ,"EE", "FI", "FR", "EL", "HR", "HU", "IE",
"IT" , "LT","LU", "LV", "MT", "NL", "PL",
"PT", "RO", "SK", "SI", "SE", "UK"),
country=c("Austria", "Belgium", "Bulgaria", "Cyprus", "Czechia", "Germany",
"Denmark", "Spain", "Estonia", "Finland", "France", "Greece",
"Croatia", "Hungary", "Ireland","Italy", "Lithuania", "Luxembourg",
"Latvia", "Malta", "Netherlands", "Poland", "Portugal",
"Romania", "Slovakia", "Slovenia", "Sweden", "United Kingdom")
)
iso_map_eu27 <- tibble(iso3=c("AUT", "BEL", "BGR", "CYP", "CZE", "DEU", "DNK", "ESP", "EST", "FIN",
"FRA",
"GRC", "HRV", "HUN", "IRL", "ITA", "LTU", "LUX", "LVA", "MLT",
"NLD", "POL", "PRT",
"ROM", "SVK", "SVN", "SWE"),
iso2 = c("AT", "BE", "BG", "CY", "CZ","DE",
"DK", "ES" ,"EE", "FI", "FR", "EL", "HR", "HU", "IE",
"IT" , "LT","LU", "LV", "MT", "NL", "PL",
"PT", "RO", "SK", "SI", "SE"),
country=c("Austria", "Belgium", "Bulgaria", "Cyprus", "Czechia", "Germany",
"Denmark", "Spain", "Estonia", "Finland", "France", "Greece",
"Croatia", "Hungary", "Ireland","Italy", "Lithuania", "Luxembourg",
"Latvia", "Malta", "Netherlands", "Poland", "Portugal",
"Romania", "Slovakia", "Slovenia", "Sweden")
)
################################################################################################################################################################################################################################################################################################################################################################################################################################################
## Extra functions to extract strings. See
## https://github.com/johncassil/stringr.plus/tree/main/R
str_extract_after <- function(string, pattern, num_char = NULL){
position_of_pattern <- stringr::str_locate(string=string, pattern = pattern)
end_of_pattern <- position_of_pattern[,'end']
string_extract <- stringr::str_sub(string = string,
start = (end_of_pattern + 1),
end = if (is.null(num_char)) {nchar(string)} else {(end_of_pattern + num_char)})
return(string_extract)
}
str_extract_after_date <- function(string, date_sep = "", format = "num", num_char = NULL){
if (format == "num") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("[:punct:][0-9{date_sep}]{{8,}}"))
}
else if (format == "mdy-abbr") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("([:alpha:]{{3}}{date_sep}\\d{{1,2}}{date_sep}\\d{{4}}[:punct:])"))
}
else if (format == "mdy-full") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("([:alpha:]{{3,}}{date_sep}\\d{{1,2}}{date_sep}\\d{{4}}[:punct:])"))
}
else if (format == "dmy-abbr") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("(\\d{{1,2}}{date_sep}[:alpha:]{{3}}{date_sep}\\d{{4}}[:punct:])"))
}
else if (format == "dmy-full") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("(\\d{{1,2}}{date_sep}[:alpha:]{{3,}}{date_sep}\\d{{4}}[:punct:])"))
}
else {
stop("category must be one of 'num', 'mdy-abbr', 'mdy-full', 'dmy-abbr', or 'dmy-full'")
}
end_of_pattern <- position_of_pattern[,"end"]
string_extract <- stringr::str_sub(string = string,
start = (end_of_pattern + 1 ),
end = if (is.null(num_char)) {nchar(string)} else {(end_of_pattern + num_char)})
return(string_extract)
}
str_extract_before <- function(string, pattern, num_char = NULL){
position_of_pattern <- stringr::str_locate(string=string, pattern = pattern)
start_of_pattern <- position_of_pattern[,"start"]
string_extract <- stringr::str_sub(string = string,
start = if (is.null(num_char)) {0} else {(start_of_pattern - num_char)},
end = (start_of_pattern - 1) )
return(string_extract)
}
str_extract_before_date <- function(string, date_sep = "", format = "num", num_char = NULL){
if (format == "num") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("[:punct:][0-9{date_sep}]{{8,}}"))
}
else if (format == "mdy-abbr") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("([:punct:][:alpha:]{{3}}{date_sep}\\d{{1,2}}{date_sep}\\d{{4}})"))
}
else if (format == "mdy-full") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("([:punct:][:alpha:]{{3,}}{date_sep}\\d{{1,2}}{date_sep}\\d{{4}})"))
}
else if (format == "dmy-abbr") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("([:punct:]\\d{{1,2}}{date_sep}[:alpha:]{{3}}{date_sep}\\d{{4}})"))
}
else if (format == "dmy-full") {
position_of_pattern <- stringr::str_locate(string=string, pattern = glue::glue("([:punct:]\\d{{1,2}}{date_sep}[:alpha:]{{3,}}{date_sep}\\d{{4}})"))
}
else {
stop("category must be one of 'num', 'mdy-abbr', 'mdy-full', 'dmy-abbr', or 'dmy-full'")
}
start_of_pattern <- position_of_pattern[,"start"]
string_extract <- stringr::str_sub(string = string,
start = if (is.null(num_char)) {0} else {(start_of_pattern - num_char)},
end = (start_of_pattern - 1) )
return(string_extract)
}
str_extract_between <- function(string, pattern1, pattern2){
position_of_pattern1 <- stringr::str_locate(string=string, pattern = pattern1)
end_of_pattern1 <- position_of_pattern1[,"end"]
position_of_pattern2 <- stringr::str_locate(string=string, pattern = pattern2)
start_of_pattern2 <- position_of_pattern2[,"start"]
string_extract <- stringr::str_sub(string = string, start = (end_of_pattern1 + 1), end = (start_of_pattern2 - 1) )
return(string_extract)
}
str_extract_context <- function(string, pattern, window_size = 10){
positions_of_pattern <- stringr::str_locate(string = string, pattern = pattern)
string_extract_all <- rep(character(0), nrow(positions_of_pattern))
window_size <- if(length(window_size) == 1) {
rep(window_size, length(positions_of_pattern))
} else {
window_size
}
for(i in 1:nrow(positions_of_pattern)){
start_of_pattern <- positions_of_pattern[i,"start"]
end_of_pattern <- positions_of_pattern[i,"end"]
string_extract <- stringr::str_sub(string = string[i],
start = max(0,
start_of_pattern - window_size[i]),
end = min(nchar(string[i]),
end_of_pattern + window_size[i])
)
string_extract_all[i] <- string_extract
}
return(string_extract_all)
}
str_extract_context_all <- function(string, pattern, window_size = 10){
positions_of_pattern <- stringr::str_locate_all(string = string, pattern = pattern)
window_size <- if(length(window_size) == 1) {
rep(window_size, length(positions_of_pattern))
} else {
window_size
}
extract_per_string <- function(string, positions_of_pattern, window_size){
positions_of_pattern <- as.data.frame(positions_of_pattern)
string_extract_all <- rep(character(0), nrow(positions_of_pattern))
if(nrow(positions_of_pattern) > 0){
for(i in 1:nrow(positions_of_pattern)){
start_of_pattern <- positions_of_pattern[i,"start"]
end_of_pattern <- positions_of_pattern[i,"end"]
string_extract <- stringr::str_sub(string = string,
start = max(0,
start_of_pattern - window_size),
end = min(nchar(string),
end_of_pattern + window_size)
)
string_extract_all[i] <- string_extract
}
}
return(string_extract_all)
}
return_list <- unname(mapply(extract_per_string, string, positions_of_pattern, window_size))
return(return_list)
}
## str_detect_multiple <- function(string, patterns, method){
## if(method == 'and'){
## all(stringr::str_detect(string, patterns))
## } else if (method == 'or'){
## any(stringr::str_detect(string, patterns))
## } else {stop("argument 'method' must be either 'or' or 'and'")}
## }
## str_detect_multiple_and <- function(string, patterns){
## str_detect_multiple(string, patterns, method = "and")
## }
## str_detect_multiple_or <- function(string, patterns){
## str_detect_multiple(string, patterns, method = "or")
## }
## See https://stackoverflow.com/questions/44759180/filter-by-multiple-patterns-with-filter-and-str-detect
str_detect_multiple <- function(x,tt){
tt2 <- paste0(tt, collapse="|")
res <- str_detect(x,tt2)
return(res)
}
str_detect_multiple_case_insensitive <- function(x,tt){
tt2 <- paste0(tt, collapse="|(?i)")
res <- str_detect(x,tt2)
return(res)
}
##See https://stackoverflow.com/questions/67601507/r-using-dplyr-to-find-and-filter-for-a-string-in-a-whole-data-frame
## I also created some version where only exact matches will be found.
## i.e. if I look for "ab" in ("ab", "abc", "abcde") only the first element of
## the vector return true.
find_text <- function(df, tt){
## res <- df %>%
## mutate(across(where(is.character), ~str_detect(.x, tt)))
res <- df |>
mutate(across(where(is.character), \(x) str_detect(x, tt)))
return(res)
}
find_text_exact_matches <- function(df, tt){
## res <- df %>%
## mutate(across(where(is.character), ~find_exact_matches(tt,.x)))
res <- df |>
mutate(across(where(is.character), \(x) find_exact_matches(tt,x)))
return(res)
}
find_text_filter <- function(df, tt){
## res <- df %>%
## filter(if_any(where(is.character), ~str_detect(.x, tt)))
res <- df |>
filter(if_any(where(is.character), \(x) str_detect(x, tt)))
return(res)
}
find_text_filter_multiple <- function(df, tt){
## res <- df %>%
## filter(if_any(where(is.character), ~str_detect_multiple(.x, tt)))
res <- df |>
filter(if_any(where(is.character), \(x) str_detect_multiple(x, tt)))
return(res)
}
find_text_filter_exact_matches_col <- function(df, xx, tt){
res <- df |>
filter( find_exact_matches(tt,{{xx}}))
return(res)
}
## Same as before, but for a column. Notice the curly-curly to be able to
## pass the name of the column.
find_text_filter_col <- function(df,xx, tt){
res <- df |> ## %>%
filter(str_detect({{xx}}, tt))
return(res)
}
find_text_filter_exact_matches <- function(df, tt){
## res <- df %>%
## filter(if_any(where(is.character), ~find_exact_matches(tt,.x)))
res <- df |>
filter(if_any(where(is.character), \(x) find_exact_matches(tt,x)))
return(res)
}
find_text_replace <- function(df, tt, new_tt){
## res <- df %>%
## mutate(across(where(is.character), ~str_replace(.x,tt, new_tt)))
res <- df |>
mutate(across(where(is.character), \(x) str_replace(x,tt, new_tt)))
return(res)
}
find_text_replace_exact_matches <- function(df, tt, new_tt){
## res <- df %>%
## mutate(across(where(is.character),~replace_exact_matches(tt,.x, new_tt)) )
res <- df |>
mutate(across(where(is.character),\(x) replace_exact_matches(tt, x, new_tt)) )
return(res)
}
### aliases from magrittr
#' Aliases
#'
#' magrittr provides a series of aliases which can be more pleasant to use
#' when composing chains using the \code{\%>\%} operator.
#'
#' Currently implemented aliases are
#' \tabular{ll}{
#' \code{extract} \tab \code{`[`} \cr
#' \code{extract2} \tab \code{`[[`} \cr
#' \code{inset} \tab \code{`[<-`} \cr
#' \code{inset2} \tab \code{`[[<-`} \cr
#' \code{use_series} \tab \code{`$`} \cr
#' \code{add} \tab \code{`+`} \cr
#' \code{subtract} \tab \code{`-`} \cr
#' \code{multiply_by} \tab \code{`*`} \cr
#' \code{raise_to_power} \tab \code{`^`} \cr
#' \code{multiply_by_matrix} \tab \code{`\%*\%`} \cr
#' \code{divide_by} \tab \code{`/`} \cr
#' \code{divide_by_int} \tab \code{`\%/\%`} \cr
#' \code{mod} \tab \code{`\%\%`} \cr
#' \code{is_in} \tab \code{`\%in\%`} \cr
#' \code{and} \tab \code{`&`} \cr
#' \code{or} \tab \code{`|`} \cr
#' \code{equals} \tab \code{`==`} \cr
#' \code{is_greater_than} \tab \code{`>`} \cr
#' \code{is_weakly_greater_than} \tab \code{`>=`} \cr
#' \code{is_less_than} \tab \code{`<`} \cr
#' \code{is_weakly_less_than} \tab \code{`<=`} \cr
#' \code{not} (\code{`n'est pas`}) \tab \code{`!`} \cr
#' \code{set_colnames} \tab \code{`colnames<-`} \cr
#' \code{set_rownames} \tab \code{`rownames<-`} \cr
#' \code{set_names} \tab \code{`names<-`} \cr
#' \code{set_class} \tab \code{`class<-`} \cr
#' \code{set_attributes} \tab \code{`attributes<-`} \cr
#' \code{set_attr } \tab \code{`attr<-`} \cr
#' }
#'
#' @usage NULL
#' @export
#' @rdname aliases
#' @name extract
#' @examples
#' iris %>%
#' extract(, 1:4) %>%
#' head
#'
#' good.times <-
#' Sys.Date() %>%
#' as.POSIXct %>%
#' seq(by = "15 mins", length.out = 100) %>%
#' data.frame(timestamp = .)
#'
#' good.times$quarter <-
#' good.times %>%
#' use_series(timestamp) %>%
#' format("%M") %>%
#' as.numeric %>%
#' divide_by_int(15) %>%
#' add(1)
extract <- `[`
#' @rdname aliases
#' @usage NULL
#' @export
extract2 <- `[[`
#' @rdname aliases
#' @usage NULL
#' @export
use_series <- `$`
#' @rdname aliases
#' @usage NULL
#' @export
add <- `+`
#' @rdname aliases
#' @usage NULL
#' @export
subtract <- `-`
#' @rdname aliases
#' @usage NULL
#' @export
multiply_by <- `*`
#' @rdname aliases
#' @usage NULL
#' @export
multiply_by_matrix <- `%*%`
#' @rdname aliases
#' @usage NULL
#' @export
divide_by <- `/`
#' @rdname aliases
#' @usage NULL
#' @export
divide_by_int <- `%/%`
#' @rdname aliases
#' @usage NULL
#' @export
raise_to_power <- `^`
#' @rdname aliases
#' @usage NULL
#' @export
and <- `&`
#' @rdname aliases
#' @usage NULL
#' @export
or <- `|`
#' @rdname aliases
#' @usage NULL
#' @export
mod <- `%%`
#' @rdname aliases
#' @usage NULL
#' @export
is_in <- `%in%`
#' @rdname aliases
#' @usage NULL
#' @export
equals <- `==`
#' @rdname aliases
#' @usage NULL
#' @export
is_greater_than <- `>`
#' @rdname aliases
#' @usage NULL
#' @export
is_weakly_greater_than <- `>=`
#' @rdname aliases
#' @usage NULL
#' @export
is_less_than <- `<`
#' @rdname aliases
#' @usage NULL
#' @export
is_weakly_less_than <- `<=`
#' @rdname aliases
#' @usage NULL
#' @export
not <- `!`
#' @rdname aliases
#' @usage NULL
#' @export
`n'est pas` <- `!`
#' @rdname aliases
#' @usage NULL
#' @export
set_colnames <- `colnames<-`
#' @rdname aliases
#' @usage NULL
#' @export
set_rownames <- `rownames<-`
#' @rdname aliases
#' @usage NULL
#' @export
set_names <- `names<-`
#' @rdname aliases
#' @usage NULL
#' @export
set_class <- `class<-`
#' @rdname aliases
#' @usage NULL
#' @export
inset <- `[<-`
#' @rdname aliases
#' @usage NULL
#' @export
inset2 <- `[[<-`
#' @rdname aliases
#' @usage NULL
#' @export
set_attr <- `attr<-`
#' @rdname aliases
#' @usage NULL
#' @export
set_attributes <- `attributes<-`
##################################################################
##################################################################
##################################################################
##################################################################
##################################################################
### simple functions for volatility calculations.
simple_vol<- function(x, na.rm=T){
res <- sd(x, na.rm=na.rm)/mean(x, na.rm=na.rm)
return(res)
}
simple_semi_sd <- function(x, na.rm=T){
mm <- mean(x, na.rm=na.rm)
res <- sd(x[x<mm], na.rm=na.rm)
return(res)
}
simple_vol_down <- function(x, na.rm=T){
mm <- mean(x, na.rm=na.rm)
ss <- sd(x[x<mm], na.rm=na.rm)
res <- ss/mm
return(res)
}
## a function to take unique values of a vector and sort them
su <- function(x, decreasing=F, ...){
## res <- x %>% unique %>% sort(decreasing=decreasing, ...)
res <- x |>
unique() |>
sort(decreasing=decreasing, ...)
return(res)
}
## an improved version of tabyl from janitor.
tabyl_new <- function(x, ...){
res <- tabyl(x, ...) |> ## %>%
as_tibble() |> ## %>%
clean_names()
return(res)
}
####functions to read excel files
read_excel <- function(x, ...){
res <- read.xlsx(x, ...) %>%
as_tibble()
return(res)
}
read_excel_to_char <- function(x, ...){
res <- ## readxl::
read_excel(x, ...) |> ## %>%
## as_tibble %>%
all_to_char()
return(res)
}
read_csv_to_char <- function(x, ...){
res <- read_csv(x, ...) |> ## %>%
as_tibble() |> ## %>%
all_to_char()
return(res)
}
read_excel_names <- function(x, ...){
res <- ## readxl::
read_excel(x, ...) |> ## %>%
## as_tibble %>%
clean_names()
return(res)
}
### function to clean TAM data.
## clean_tam <- function(df_tam_ini){
## res <- df_tam_ini %>%
## clean_data() %>%
## mutate(company_name=if_else(!is.na(beneficiary_name_english),
## beneficiary_name_english,
## beneficiary_name)) %>%
## ## mutate(aid_award_granted_date=excel_numeric_to_date(aid_award_granted_date)) %>%
## mutate(across(contains("date"), ~excel_numeric_to_date(.x))) %>%
## mutate(year=year(aid_award_granted_date)) %>%
## mutate(instrument_type=if_else(!is.na(aid_award_instrument_other_english),
## aid_award_instrument_other_english,
## aid_award_instrument )) %>%
## mutate(lower_bound=str_extract_before(granted_range_eur,"-")) %>%
## mutate(upper_bound=str_extract_after(granted_range_eur,"-")) %>%
## mutate(lower_bound=as.numeric(lower_bound),
## upper_bound=as.numeric(upper_bound)) %>%
## mutate(estimated_amount=(lower_bound+upper_bound)/2/1e6) %>%
## mutate(estimated_amount=if_else(!is.na(estimated_amount),
## estimated_amount,
## granted_aid_absolute_eur/1e6))
## return(res)
## }
## ### function to clean tam read from a bunch of excel files.
## clean_tam_char <- function(df_tam_ini){
## res <- df_tam_ini %>%
## clean_data() %>%
## mutate(across(contains("aid_absolute_eur"), ~as.numeric(.x))) %>%
## mutate(across(contains("date"), ~as.numeric(.x))) %>%
## mutate(across(contains("date"), ~excel_numeric_to_date(.x))) %>%
## mutate(year=year(aid_award_granted_date)) %>%
## mutate(lower_bound=str_extract_before(granted_range_eur,"-")) %>%
## mutate(upper_bound=str_extract_after(granted_range_eur,"-")) %>%
## mutate(lower_bound=as.numeric(lower_bound),
## upper_bound=as.numeric(upper_bound)) %>%
## mutate(estimated_value=(lower_bound+upper_bound)/2) %>%
## pattern_to_na(0) %>%
## mutate(granted_value_extended_eur = case_when(
## !is.na(granted_aid_absolute_eur) ~ granted_aid_absolute_eur,
## is.na(granted_aid_absolute_eur) & !is.na(estimated_value) ~estimated_value,
## is.na(granted_aid_absolute_eur) & is.na(estimated_value) ~ nominal_aid_absolute_eur)) %>%
## mutate(nominal_value_extended_eur=
## case_when(!is.na(nominal_aid_absolute_eur) ~ nominal_aid_absolute_eur,
## is.na(nominal_aid_absolute_eur)~granted_value_extended_eur
## )) %>%
## select(-c(lower_bound, upper_bound, estimated_value))
## return(res)
## }
################################################################
################################################################
################################################################
## an easier function for the empirical cumulative function
my_ecdf <- function(x,ss=sort(x), minor=F){
n <- length(x)
res <- seq(n)
if (minor==T){
for(i in seq(n)){
res[i] <- length(x[x<ss[i]]) }
} else if (minor==F){
for(i in seq(n)){
res[i] <- length(x[x<=ss[i]]) }
}
res <- res/n
return(res)
}
### A function to wrap lines in pickerInput within shiny apps.
## See https://stackoverflow.com/questions/51355878/how-to-text-wrap-choices-from-a-pickerinput-if-the-length-of-the-choices-are-lo
shiny_wrap <- function(x, width){
res <- stringr::str_wrap(x, width) |> ## %>%
str_replace_all( "\\n", "<br>")
return(res)
}
all_tolower <- function(df){
## res <- df %>%
## mutate(across(where(is.character), ~(tolower(.x))))
res <- df |>
mutate(across(where(is.character), \(x) tolower(x)))
return(res)
}
## see https://yihui.shinyapps.io/DT-edit/
## This function is used to add a title to a table.
dt_output = function(title, id) {
fluidRow(column(
## 12, h1(paste0('Table ', sub('.*?([0-9]+)$', '\\1', id), ': ', title)),
12, h1(paste0(title)),
hr(), DTOutput(id)
))
}
### Gini coefficient from the ineq package
Gini = function (x, corr = FALSE, na.rm = TRUE)
{
if (!na.rm && any(is.na(x)))
return(NA_real_)
x <- as.numeric(na.omit(x))
n <- length(x)
x <- sort(x)
G <- sum(x * 1L:n)
G <- 2 * G/sum(x) - (n + 1L)
if (corr)
G/(n - 1L)
else G/n
}
########################################################################
########################################################################
########################################################################
########################################################################
########################################################################
## https://stackoverflow.com/questions/69461948/rdplyr-conditionally-swap-the-elements-of-two-columns
sort_rows <- function(df, col_names, dec=F){
temp <- df |> ## %>%
select(all_of(col_names))
extra_names <- setdiff(colnames(df), col_names)
temp2 <- df |> ## %>%
select(all_of(extra_names))
res <- t(apply(temp, 1, sort, decreasing=dec)) |> ## %>%
as_tibble(.name_repair = 'unique') |> ## %>%
set_colnames(col_names) |> ## %>%
bind_cols(temp2)
return(res)
}
## See https://stackoverflow.com/questions/57175351/flextable-autofit-in-a-rmarkdown-to-word-doc-causes-table-to-go-outside-page-mar
FitFlextableToPage <- function(ft, pgwidth = 6){
ft_out <- ft |> ## %>%
autofit()
ft_out <- width(ft_out, width = dim(ft_out)$widths*pgwidth /(flextable_dim(ft_out)$widths))
return(ft_out)
}
FitFlextableToPage <- function(ft, pgwidth = 6){
ft_out <- ft |> ## %>%
autofit()
ft_out <- width(ft_out, width = dim(ft_out)$widths*pgwidth /(flextable_dim(ft_out)$widths))
return(ft_out)
}
## See https://stackoverflow.com/a/63928956
FitFlextableToPage2 <- function(ft, pgwidth=7.5){
ft_out <- ft |>
autofit() |>
fit_to_width(7.5)
return(ft_out)
}
########################################################
date_to_date <- function(df){
## res <- df %>%
## mutate(across(contains("date") & where(is.numeric),
## ~excel_numeric_to_date(.x)))
res <- df |>
mutate(across(contains("date") & where(is.numeric),
\(x) excel_numeric_to_date(x)))
return(res)
}
### convert all the columns of the dataframe to character.
all_to_char <- function(df){
## res <- df %>%
## mutate(across(!where(is.character), ~as.character(.x)))
res <- df |>
mutate(across(!where(is.character), \(x) as.character(x)))
return(res)
}
eur_sym <- "\u20ac"
###################################################################
## See https://stackoverflow.com/questions/55171086/r-regex-extract-words-beginning-with-symbol
extract_words_starting_with <- function(x, pattern){
my_expr <- paste("(?<=^|\\s)", pattern, "[^\\s]+", sep="")
res <- str_extract_all(x, my_expr)
return(res)
}
### a function to convert data to a new base
as.binary <- function(n,base=2 , r=FALSE){
## function written by robin hankin
out <- NULL
while(n > 0) {
if(r) {
out <- c(out , n%%base)
} else {
out <- c(n%%base , out)
}
n <- n %/% base
}
ans <- str_c(out, collapse = "")
return(ans)
}
###################################################à
### some common metrics
## root mean square error
RMSE <- function(m, o){
sqrt(mean((m - o)^2, na.rm=T))
}
## mean absolute_error
MAE <- function(m, o){
mean(abs(m - o), na.rm=T)
}
### function to rename many columns. See
### https://community.rstudio.com/t/rename-with-a-named-vector-list-and-contribution-to-the-tidyverse/2383
rename_many <- function(df,name_new, name_old){
mm <- tibble(name_new, name_old)
mm2 <- deframe(mm)
res <- df |> ## %>%
rename(!!! mm2)
return(res)
}
## See https://stackoverflow.com/questions/35247033/using-rvest-to-extract-links
find_links <- function(url){
## links <- read_html(url) %>% html_nodes("a") %>% html_attr("href") %>%
## as_tibble %>%
## rename("url"="value")
links <- read_html(url) |>
html_nodes("a") |>
html_attr("href") |>
as_tibble() |>
rename("url"="value")
return(links)
}
## See discussion at https://github.com/tidyverse/dplyr/issues/3218
recode_many <- function(x, old_names, new_names){
name_lookup <- set_names(new_names, old_names) |> ## %>%
as.list()
res <- recode(x, !!!name_lookup)
return(res)
}
## Function to show a data frame in Excel.
# Show in Excel Function
show_in_excel = function(.data)
{
temp = paste0(tempfile(), ".csv")
utils::write.csv(.data, temp)
fs::file_show(path = temp)
}
## view dataframe in excel (as above but as an excel file.)
xview <- function(df){
fn <- paste0(tempfile(), ".xlsx")
writexl::write_xlsx(df, fn)
## system(sprintf("wslview %s", fn))
fs::file_show(path = fn)
}
## See https://stackoverflow.com/a/8267036
add_name_format_number <- function(prefix, x, n_digits, separator, myflag="0"){
res <- paste(prefix, formatC(x, width=n_digits, flag=myflag), sep=separator)
return(res)
}
## See https://www.r-bloggers.com/2022/04/robservations-30-fixing-rs-messy-string-concatenation-with-a-special-function/
"%+%" <- function(string1, string2){
paste0(string1,string2)
}
#function to partially source a file.
source2 <- function(file, start, end, ...) {
file.lines <- scan(file, what=character(), skip=start-1, nlines=end-start+1, sep='\n')
file.lines.collapsed <- paste(file.lines, collapse='\n')
source(textConnection(file.lines.collapsed), ...)
}
## see https://stackoverflow.com/questions/9508518/why-are-these-numbers-not-equal
equal_within <- function(x, y, percent){
res <- if_else((abs(x-y)/abs(pmin(x,y)))<=percent/100., T, F)
return(res)
}
calculate_return <- function(x) {
res <- c(NA,(tail(x, -1)
-head(x, -1))/head(x, -1))
return(res)
}
calculate_return_log <- function(x) {
res <- c(NA,log(tail(x, -1)/head(x, -1)))
return(res)
}
### a function to find multiple keywords simultaneously
find_text_multiple_keywords_exact_matches <- function( df, keywords){
res <- map_df(keywords,
function(x) find_text_filter_exact_matches(df, x) )
return(res)
}
### Now its parallel version. It needs library(furrr) and something like
## n_cores <- 2
## plan(multicore(workers=return_cores(n_cores)))
## to be able to use multiple cores.
find_text_multiple_keywords_exact_matches_future <- function( df, keywords){
res <- future_map_dfr(keywords,
function(x) find_text_filter_exact_matches(df, x) ,
.progress =T)
return(res)
}
## See https://statisticsglobe.com/insert-character-pattern-in-string-r
insert_char <- function(x, pos, insert) { # Create own function
gsub(paste0("^(.{", pos, "})(.*)$"),
paste0("\\1", insert, "\\2"),
x)
}
remove_char <- function(x, pattern){
res <- str_replace_all(x, pattern, "")
return(res)
}
## see https://stackoverflow.com/questions/40744463/how-to-extract-numbers-from-text
extract_numbers <- function(string){
res <- str_extract_all(string, "[0-9]+") |>
unlist()
return(res)
}
## see https://stackoverflow.com/a/24174655
remove_text_in_brackets <- function(string){
res <- str_replace(string, " \\s*\\([^\\)]+\\)", "")
return(res)
}
## function to turn an integer sequence into a sequence with a given width
seq_fixed_width <- function(myseq, width){
string <- paste("%0", width, "d", sep="")
res<- myseq |>
(\(x) sprintf(string, x))()
## sprintf("%03d", a)
return(res)
}
## pad_facets <- function(df, facet_var, cat_var) {
## df |>
## distinct({{facet_var}}, {{cat_var}}) |> # keep 1 row per facet/cat
## count({{facet_var}}) |> # how many cat per facet?
## transmute({{facet_var}}, cat_to_add = max(n) - n) |> # calc # cat to add
## uncount(cat_to_add) |> # copy each cat x times
## mutate("{{cat_var}}" := as_factor(row_number())) |> # add new factor level
## bind_rows(df) # add on original data
## }
## See https://stackoverflow.com/a/76343849/2952838
pad_facets <- function(df, facet_var, cat_var) {
df |>
distinct({{facet_var}}, {{cat_var}}) |> # keep 1 row per facet/cat
count({{facet_var}}) |> # how many cat per facet?
transmute({{facet_var}}, cat_to_add = max(n) - n) |> # calc # cat to add
uncount(cat_to_add) |> # copy each cat x times
mutate({{cat_var}} := as_factor(row_number())) |> # add new factor level
bind_rows(df) # add on original data
}
## See https://stackoverflow.com/questions/5409776/how-to-order-bars-in-faceted-ggplot2-bar-chart/5414445#5414445
sort_facets <- function(df, cat_a, cat_b, cat_out, ranking_var){
res <- df |>
mutate({{cat_out}}:=factor(paste({{cat_a}}, {{cat_b}}))) |>
mutate({{cat_out}}:=reorder({{cat_out}}, rank({{ranking_var}})))
return(res)
}
### Function for text mining using tidytext
clean_text <- function(fname){
df <- readLines(fname) |>
gsub(pattern="’", replacement= '') |>
gsub(pattern="[0-9]+", replacement="") |>
gsub(pattern="[[:punct:]]",replacement=" ")
text_df <- tibble(line = 1:length(df), text = df) ## %>% set_colnames(c("line", "text"))
return(text_df)
}
clean_unigrams <- function(tidytext, stop_words){
tidytext |>
unnest_tokens(word, text) |>
anti_join(stop_words)
}
clean_bigrams <- function(tidytext, stop_words){
tidytext |>
unnest_tokens(bigram, text, token="ngrams", n=2) |>
separate(bigram, c("word1", "word2"), sep=" ") |>
filter(!word1 %in% stop_words$word ) |>
filter(!word2 %in% stop_words$word )
}
clean_trigrams <- function(tidytext, stop_words){
tidytext |>
unnest_tokens(trigram, text, token="ngrams", n=3) |>
separate(trigram, c("word1", "word2", "word3"), sep=" ") |>
filter(!word1 %in% stop_words$word ) |>
filter(!word2 %in% stop_words$word ) |>
filter(!word3 %in% stop_words$word )
}
count_unigrams <- function(tidytext, stop_words){
tidytext |>
unnest_tokens(word, text) |>
anti_join(stop_words) |>
count(word, sort = TRUE)
}
count_bigrams <- function(tidytext, stop_words){
tidytext |>
unnest_tokens(bigram, text, token="ngrams", n=2) |>
separate(bigram, c("word1", "word2"), sep=" ") |>
filter(!word1 %in% stop_words$word ) |>
filter(!word2 %in% stop_words$word ) |>
count(word1, word2, sort=T)
}
count_trigrams <- function(tidytext, stop_words){
tidytext |>
unnest_tokens(trigram, text, token="ngrams", n=3) |>
separate(trigram, c("word1", "word2", "word3"), sep=" ") |>
filter(!word1 %in% stop_words$word ) |>
filter(!word2 %in% stop_words$word ) |>
filter(!word3 %in% stop_words$word ) |>
count(word1, word2, word3, sort=T)
}
#### function to remove the inner boundary in Cyprus
## see https://stackoverflow.com/questions/68672575/r-rnaturalearth-and-sf-remove-a-single-border-from-map
fix_cyprus <- function(ww_ini, variable, cyprus_names=c("CYP", "CYN")){
cyprus <- ww_ini |>
select({{variable}}) |>
filter({{variable}} %in% c("CYP", "CYN")) |>
mutate({{variable}} := "CYP") |>
## I renamed as Cyprus my version
group_by({{variable}}) |>
summarise()
res <- ww_ini |>
select({{variable}}) |>
filter({{variable}} %!in% c("CYP", "CYN")) |>
bind_rows(cyprus)
return(res)
}
## See https://stackoverflow.com/questions/77551546/r-naturalearth-etrs89-etrs-laea-projection?noredirect=1#comment136718692_77551546
lims_calc <- function(x1,y1,x2,y2, projection){
res <- st_sfc(st_point(c(x1, y1)), st_point(c(x2, y2)), crs = 4326) |>
st_transform(projection) |>
st_bbox()
return(res)
}
## New functions for downside risk
semivariance <- function(x){
mm <- mean(x)
x_sel <- x[x<= mean(x)]
n <- length(x_sel)
res <- 1/n*sum((x_sel-mm)**2)
return(res)
}
semideviation <- function(x){
res <- sqrt(semivariance(x))
return(res)
}
#### this works to change simple json files into tibbles
simple_json_to_tibble <- function(list_json){
##list_json comes from reading a json file with
## fromJSON("file2.json") from the jsonlite package
df_out <- list_json |>
unnest() |>
as_tibble() |>
clean_names()
return(df_out)
}
##convert simple list to dataframe
## list_to_df <- function(mylist, myid=NULL){
## res <- mylist |>
## bind_rows(.id=myid)
## return(res)
## }
list_to_df <- function(mylist, myid=NULL, source_col_name="source") {
if (!is.null(myid) && length(myid) == length(mylist)) {
names(mylist) <- myid
}
res <- bind_rows(mylist, .id = source_col_name) |>
as_tibble()
return(res)
}
### fix for a bug in binancer to get the coin prices
prices_usdt <- function(){
res <- binance_ticker_all_prices() |>
as_tibble() |>
filter(to=="USDT") |>
select(from, from_usd) |>
rename("symbol"="from",
"price_usdt"="from_usd") |>
add_row(symbol="USDT", price_usdt=1.) |>
distinct() |>
arrange(desc(price_usdt))
return(res)
}
## See https://stackoverflow.com/questions/29873293/dplyr-order-columns-alphabetically-in-r/29873357#29873357
glimpse_ord <- function(df){
## df |> select(order(colnames(df))) |>
df[,order(colnames(df))] |>
glimpse()
}
##function for jaccard similarity
jaccard <- function(a, b) {
intersection = length(intersect(a, b))
union = length(a) + length(b) - intersection
return (intersection/union)
}
#See https://stackoverflow.com/questions/41434561/remove-single-character-in-string/41434625#41434625
remove_isolated_characters <- function(x) {
res <- gsub("\\W*\\b\\w\\b\\W*", " ", x)
return(res)
}
## See https://elk.zone/fosstodon.org/@teunbrand/112701244947232879
is_equal <- function(x,y){
res <- nrow(vctrs::vec_set_symmetric_difference(scramble, mtcars)) == 0
return(res)
}
reorder_columns <- function(df){
res <- df[,order(colnames(df))]
return(res)
}
## See https://stackoverflow.com/questions/25352448/remove-urls-from-string/25352562#25352562
remove_url <- function(x){
res <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", x)
return(res)
}
## See https://stackoverflow.com/questions/68170333/is-it-possible-to-name-a-column-of-a-tibble-using-a-variable-containing-a-charac
string_list_to_tibble <- function(x, name){
res <- tibble({{name}} := unlist(x))
return(res)
}
##Explanation of the function below.
## stri_replace_all_regex(string, "[^\\p{L}\\p{N}\\p{Zs}]", "", vectorize_all = FALSE):
## [^\\p{L}\\p{N}\\p{Zs}]: This regex pattern matches any character that is not a letter, number, or space separator.
## vectorize_all = FALSE: Ensures the replacement is applied efficiently, especially for long strings.
clean_string <- function(string){
res <- stri_replace_all_regex(string, "[^\\p{L}\\p{N}\\p{Zs}]", "", vectorize_all = FALSE)
return(res)
}
## Chatgpt functions
remove_file <- function(path) {
if (file.exists(path)) {
if (file.remove(path)) {
message("File deleted successfully")
return(TRUE)
} else {
warning("Failed to delete file")
return(FALSE)
}
} else {
message("No file found at the specified path")
return(FALSE)
}
}
remove_files_with_pattern <- function(path_pattern) {
# Find all files matching the pattern
files <- Sys.glob(path_pattern)
# Check if any files are found
if (length(files) == 0) {
message("No files found matching the pattern: ", path_pattern)
return(FALSE)
}
# Try removing each file
removed_files <- file.remove(files)
if (all(removed_files)) {
message("All files deleted successfully.")
return(TRUE)
} else {
warning("Some files could not be deleted.")
return(FALSE)
}
}
name_to_integer <- function(name){
res <- utf8ToInt(name) |>
sum()
return(res)
}
transform_date_url <- function(date_str) {
# Replace the slashes with "%2F" and prepend "="
result <- gsub("/", "%2F", date_str)
result <- paste0("=", result)
return(result)
}
## function to generate n positive real number whose sum is a given total
random_sum <- function(total, n){
ss <- runif(n+1, 0, 1) |>
scale01() |>
multiply_by(total) |>
sort()
res <- diff(ss)
return(res)
}
copy_files_from_B_to_A <- function(dirA, dirB) {
# Get the list of files in directory A
files_in_A <- list.files(dirA, full.names = TRUE)
# Initialize a counter for the number of files copied
files_copied <- 0
# Loop through each file in directory A
for (fileA in files_in_A) {
# Get the base filename (without the directory path)
filename <- basename(fileA)
# Construct the corresponding file path in directory B
fileB <- file.path(dirB, filename)
# Check if the file exists in directory B
if (file.exists(fileB)) {
# Print the file being copied
message(paste("Copying", filename, "from", dirB, "to", dirA))
# Copy the file from B to A, overwriting the one in A
file.copy(fileB, fileA, overwrite = TRUE)
# Increment the counter
files_copied <- files_copied + 1
}
}
# Print the total number of files copied
message(paste("Total files copied:", files_copied))
}
# Example usage:
# copy_files_from_B_to_A("path/to/A", "path/to/B")
# Define the custom function to add a row with an arbitrary function and custom value for non-numeric columns
adorn_custom <- function(df, fun, exclude = NULL, na.rm = TRUE, position = "bottom", custom_values = list()) {
# Identify numeric columns where the function should be applied, excluding specified columns
cols_to_apply <- df |>
select(where(is.numeric)) |>
select(-all_of(exclude)) |>
colnames()
# Apply the function to the selected numeric columns
custom_row <- df |>
summarise(across(all_of(cols_to_apply), ~ fun(., na.rm = na.rm)))
# Create a new row with NA for all columns first
custom_row_full <- df[1, ] |>
summarise(across(everything(), ~ NA)) |>
mutate(across(all_of(cols_to_apply), ~ custom_row[[cur_column()]]))
# Assign custom values for non-numeric (or excluded) columns
for (col_name in names(custom_values)) {
if (col_name %in% colnames(df)) {
custom_row_full[[col_name]] <- custom_values[[col_name]]
}
}
# Add the custom row at the specified position
if (position == "top") {
df_with_custom <- bind_rows(custom_row_full, df)
} else if (position == "bottom") {
df_with_custom <- bind_rows(df, custom_row_full)
} else {
stop("Invalid position. Use 'top' or 'bottom'.")
}
return(df_with_custom)
}
## # Example usage
## # Sample dataframe with a 'Year' and 'Category' column
## df <- data.frame(
## Category = c("A", "B", "C"),
## Year = c(2020, 2021, 2022),
## Value1 = c(10, 20, 30),
## Value2 = c(40, 50, 60)
## )
## # Add a row with the mean, exclude 'Year', and set a custom value for the 'Category' column
## df_with_mean_top <- adorn_custom(
## df,
## mean,
## exclude = c("Year"),
## position = "top",
## custom_values = list(Category = "Summary")
## )
## print(df_with_mean_top)
## # Add a row with the mean, exclude 'Year', and set a custom value for the 'Category' column at the bottom
## df_with_mean_bottom <- adorn_custom(
## df,
## mean,
## exclude = c("Year"),
## position = "bottom",
## custom_values = list(Category = "Mean Summary")
## )
## print(df_with_mean_bottom)
set_nth <- function(x, n = NULL, value) {
# Handle case for "first" and "last" indices
if (is.character(n)) {
if (n == "first") {
n <- 1
} else if (n == "last") {
n <- length(x)
} else {
stop("Invalid value for 'n'. Use a numeric index, 'first', or 'last'.")
}
}
# Check if n is within bounds
if (!is.numeric(n) || n < 1 || n > length(x)) {
stop("'n' must be a valid position in 'x'")
}
# Set the nth element to the given value
x[n] <- value
return(x)
}
## example usages
## ll <- c(1,2,3,4,5) |>
## set_nth(3, 10)
## tt <- tibble(x = 1:5, y = letters[1:5])
## tt2 <- tt |>
## mutate(x=set_nth(x, 3, -10))
## function to calculate the cumulative number of unique elements by group.
cumulative_unique_by_group <- function(df, group_var, value_var) {
df |>
arrange({{ group_var }}) |>
mutate(unique_values_so_far = cumsum(!duplicated({{ value_var }}))) |>
group_by({{ group_var }}) |>
summarise(cumulative_unique_count = max(unique_values_so_far), .groups = 'drop')
}
## library(stringr)
## a function to split a string according to a separator and which
## then takes the first n characters of every element and returns a vector.
extract_first_n_chars <- function(input_string, separator = ";", n = 1) {
# Split the string by the specified separator
elements <- str_split(input_string, str_c(separator, "\\s*"))[[1]]
# Extract the first `n` characters of each element
extracted_chars <- str_sub(elements, 1, n)
return(extracted_chars)
}
## # Example usage
## ss <- "M.74.90; M.75; M.75.00; N.77.22; N.77.31; N.78; N.79; N.79.90; N.81; N.81.10; N.81.2"
## # Extract the first letter of each element
## extract_first_n_chars(ss)
## # Extract the first two characters of each element
## extract_first_n_chars(ss, n = 2)
## # Use a different separator
## custom_string <- "A, B, C, D"
## extract_first_n_chars(custom_string, separator = ",")
count_non_pattern_chars <- function(input_string, pattern) {
# Remove all characters matching the pattern and count the length of the remaining string
non_pattern_count <- str_length(str_replace_all(input_string, pattern, ""))
return(non_pattern_count)
}
##This function calculates the total per group (sector) when multiple
## sectors appear on the same line e.g. c("A", "A B C", "D G")
calculate_total_per_group <- function(sector, money_per_sector, sector_separator = " ", sector_col_output_name, value_col_output_name) {
data <- tibble(sector_col=sector, money_per_sector_col=money_per_sector )
## print(data)
res <- data |>
# Split the sectors column into individual rows based on the provided separator
separate_rows(sector_col, sep = sector_separator) |>
# Assign money per sector by repeating the corresponding value for each sector
mutate(money_for_sector = money_per_sector_col) |>
# Group by sector and calculate the total money for each sector
group_by(sector_col) |>
summarise(total_money = sum(money_for_sector, na.rm=T), .groups = "drop") |>
rename({{sector_col_output_name}} :=sector_col ,
{{value_col_output_name}} :=total_money)
return(res)
}