library(stringr)
library(knitr, include.only = 'kable')
source(paste0(here::here(), "/R/format-study-results.R"))
source(paste0(here::here(), "/R/parse-model-output.R"))
Library Parser
Pre-processing
Preparing data
The following code extracts information from the .bib library and formats it as a data.frame for further processing. The code relies on the stringr package, using regular expressions to extract the relevant parameters. First, we read in the .bib file and use some string manipulation to retrieve the citation keys.
bib_file <- read.delim('bib/extractions.bib',
                       sep = '@', header = FALSE)
# get citekeys from bibtex file:
citekeys <- unique(bib_file$V2)
# improve formatting
citekeys <- str_remove(citekeys, '\\{')
citekeys <- str_remove(citekeys, ',')
citekeys <- str_remove(citekeys, '%%.*$')
citekeys <- str_remove(citekeys, 'Article')
citekeys[citekeys == ''] <- NA
citekeys <- na.omit(citekeys)
R reads the .bib file as a two-column data.frame, with the citation key appearing in the second column and the remaining metadata in the first. On rows where the citation key appears in the second column, the first column is blank. Because of this quirk, we can match metadata to each citation key by tracking the rows where the second column is non-blank, i.e., where a new entry begins. We append each block of metadata to a new entry of a list: the name of each list entry is the citation key, and the corresponding value is the remaining unprocessed metadata.
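To make the quirk concrete, here is a toy illustration with synthetic rows (doe2020aa and roe2021bb are made-up citekeys, not entries from the real library):
# synthetic stand-in for the result of read.delim(..., sep = '@'):
# V1 holds metadata lines; V2 holds the citekey line and is blank elsewhere
toy_bib <- data.frame(
  V1 = c('', 'AUTHOR = {Doe},', 'YEAR = {2020},', ''),
  V2 = c('Article{doe2020aa,', '', '', 'Article{roe2021bb,')
)
which(toy_bib$V2 != '')  # rows 1 and 4 mark the start of each entry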
# find where new entries begin:
new_entries <- which(bib_file$V2 != '')
# loop across unique indices for each entry
meta_list <- list()
for (this_entry in 1:(length(new_entries) - 1))
{
  # get unique citekey
  this_cite_key <- citekeys[this_entry]
  # capture lines following the citekey (parentheses make the offset explicit)
  corresponding_lines <- bib_file[(new_entries[this_entry]:new_entries[this_entry + 1]) - 1, ]$V1
  # store matching lines as a data frame
  corresponding_lines <- data.frame(corresponding_lines)
  # assign lines a distinct name
  names(corresponding_lines) <- this_cite_key
  # add to a list for further processing
  meta_list <- append(meta_list, corresponding_lines)
}
Extracting Relevant .bib Fields
Not every BibTeX field is equally useful for analysis. To facilitate data manipulation, we save the names of the target fields separately in a .txt file and use a regular expression to create a new column each time R finds one of the target fields in the string holding an entry's BibTeX metadata.
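As a quick illustration of the pattern construction, suppose the target file listed just two fields (paradigm and final_notes are stand-ins here, not the full field list):
# build the same kind of pattern for two illustrative fields
toy_fields <- toupper(c('paradigm', 'final_notes'))
toy_pattern <- paste0(toy_fields[1], '\\s*(.*?)\\s', toy_fields[2], '.*')
# the capture group returns the text sitting between the two fields
str_match('PARADIGM = {regression}, FINAL_NOTES = {none}', toy_pattern)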
# read in target bibtex fields
search_fields <- field_names <- readLines('bibtex_fields.txt')
# match casing in bibtex file
field_names <- toupper(field_names)
# add a pattern allowing us to find text between two adjacent bibtex fields
rep_pattern <- paste0(field_names[-length(field_names)], '\\s*(.*?)\\s')
# apply this pattern to all but the last of the field names
field_names[-length(field_names)] <- rep_pattern
# for the last field, match everything that follows it
field_names[length(field_names)] <- paste0(field_names[length(field_names)], '.*')
# collapse all the new field names into a single pattern for string manipulation with stringr
field_names <- paste0(field_names, collapse = '')
Prepare dataframe
Now we can convert our list into a data frame with the target BibTeX fields. For the last field, FINAL_NOTES, we apply a different regex pattern, (?<=FINAL_NOTES).*, which uses a lookbehind to match all characters following the field name.
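For intuition, the lookbehind matches everything after the literal field name without consuming the name itself (toy string below):
# returns ' = {checked}' ; 'FINAL_NOTES' itself is not part of the match
str_match('... FINAL_NOTES = {checked}', '(?<=FINAL_NOTES).*')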
# create new column containing information between two adjacent target fields for all entries in list
meta_df <- lapply(meta_list, function(x) str_match(paste0(x, collapse = ' '), field_names))
# collapse list entries into rows
meta_df <- do.call('rbind', meta_df)
# format as a data.frame
meta_df <- data.frame(meta_df)
# match text after final column name
meta_df[,ncol(meta_df)+1] <- sapply(meta_df[,1], function(x) str_match(paste0(x, collapse = ' '), '(?<=FINAL_NOTES).*'))
# replace first column with citationkeys
meta_df[,1] <- names(meta_list)
names(meta_df) <- c('citekey', search_fields)
names(meta_df) <- trimws(names(meta_df))
Formatting
Finally, we’ll perform some formatting to remove unwanted characters left over from the conversion (work in progress).
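As a toy demonstration, here is the effect of those removals on a single made-up field value:
# toy field value before clean-up
x <- ' = {perceived emotion}, %% double-checked'
x <- str_remove_all(x, '\\{')   # drop opening brace
x <- str_remove_all(x, '\\},')  # drop closing brace + comma
x <- str_remove(x, '=')         # drop the bibtex assignment
str_remove_all(x, '%%.*')       # drop trailing comments -> ' perceived emotion'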
## remove bibtex field formatting
# remove curly braces
meta_df <- apply(meta_df, 2, function(x) str_remove_all(x, '\\{'))
meta_df <- apply(meta_df, 2, function(x) str_remove_all(x, '\\},'))
# remove first '=' (from the bibtex field assignment)
meta_df <- apply(meta_df, 2, function(x) str_remove(x, '='))
# remove double-commas (currently disabled)
# meta_df <- apply(meta_df, 2, function(x) str_remove_all(x, ',,'))
# remove comments
meta_df <- apply(meta_df, 2, function(x) str_remove_all(x, '%%.*'))
# meta_df <- apply(meta_df, 2, function(x) str_remove_all(x, ' , '))
# remove extra characters in final column
meta_df[, ncol(meta_df)] <- str_remove_all(meta_df[, ncol(meta_df)], '\\}')
meta_df <- as.data.frame(meta_df)
Example: Evaluate model_rate_emotion_values as R code
eval(parse(text = meta_df$model_rate_emotion_values[22]))
|  | arousal_r2 | arousal_variance explained | valence_r2 | valence_variance explained | resonance_r2 | resonance_variance explained |
|---|---|---|---|---|---|---|
| various.svr.mixed.classical.1 | 0.7249 | 0.7556 | 0.6119 | 0.6142 | 0.5374 | 0.5496 |
| various.sparse bayesian regression.mixed.classical.1 | 0.7381 | 0.7395 | 0.6296 | 0.6376 | 0.5456 | 0.5558 |
| various.variational bayesian regression.mixed.classical.1 | 0.7108 | 0.7415 | 0.6328 | 0.6340 | 0.5554 | 0.5630 |
Track Excluded Studies (During Extraction)
meta_df[which(str_detect(meta_df$final_notes, '!EXCL!')), ] |> dplyr::tibble()
# A tibble: 12 × 28
citekey paradigm notes_ca notes_te emotions emotion_locus stimulus_type
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "bai2017mu" " class… " inclu… " inclu… " valen… " perceived … " MediaEval …
2 "feng2024ex" " class… " inclu… " inclu… " " " " " "
3 "nag2022on" " class… " inclu… " inclu… " happy… " " " "
4 "bai2016di" " regre… " inclu… " inclu… " valen… " perceived … " MediaEval …
5 "cao2023th" " regre… " inclu… " inclu… " valen… " perceived … " million so…
6 "chin2018pr" " regre… " inclu… " inclu… " valen… " perceived … " NTUMIR, Me…
7 "malheiro201… " regre… " inclu… " inclu… " " " " " "
8 "medina2020e… " class… " inclu… " inclu… " valen… " perceived … " MediaEval …
9 "panwar2019a… " regre… " inclu… " inclu… " " " " " "
10 "vempala2024… " regre… " inclu… " inclu… " valen… " perceived … " classical …
11 "xia2022st" " regre… " inclu… " inclu… " " " " " "
12 "zhang2024ap" " regre… " inclu… " inclu… " " " " " "
# ℹ 21 more variables: stimulus_genre <chr>, stimulus_duration <chr>,
# stimulus_duration_unit <chr>, stimulus_n <chr>, feature_n <chr>,
# participant_n <chr>, participant_expertise <chr>, participant_origin <chr>,
# participant_sampling <chr>, participant_task <chr>,
# feature_categories <chr>, feature_source <chr>,
# feature_reduction_method <chr>, model_category <chr>, model_detail <chr>,
# model_measure <chr>, model_complexity_parameters <chr>, …
included_studies <- meta_df[-which(str_detect(meta_df$final_notes, '!EXCL!')), ]
included_studies |> dplyr::tibble()
# A tibble: 34 × 28
citekey paradigm notes_ca notes_te emotions emotion_locus stimulus_type
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 agarwal2021an " class… " inclu… " inclu… " discr… " perceived … " Hindi musi…
2 alvarez2023ri " class… " inclu… " inclu… " discr… " perceived … " Spotify "
3 bhuvanakumar… " class… " inclu… " inclu… " quadr… " not specif… " EMOPIA; po…
4 dufour2021us " class… " inclu… " inclu… " c1 (… " perceived … " pop, disco…
5 hizlisoy2021… " class… " inclu… " inclu… " valen… " perceived … " Turkish tr…
6 nguyen2017an " class… " inclu… " inclu… " 288 e… " perceived … " pop "
7 panda2020no " class… " inclu… " inclu… " valen… " perceived … " AllMusic "
8 sorussa2020em " class… " inclu… " inclu… " valen… " perceived … " DEAM "
9 yang2021an " class… " inclu… " inclu… " happy… " perceived … " MediaEval …
10 yeh2014po " class… " inclu… " inclu… " happy… " perceived … " chorus of …
# ℹ 24 more rows
# ℹ 21 more variables: stimulus_genre <chr>, stimulus_duration <chr>,
# stimulus_duration_unit <chr>, stimulus_n <chr>, feature_n <chr>,
# participant_n <chr>, participant_expertise <chr>, participant_origin <chr>,
# participant_sampling <chr>, participant_task <chr>,
# feature_categories <chr>, feature_source <chr>,
# feature_reduction_method <chr>, model_category <chr>, model_detail <chr>, …
Track Incomplete Extractions
Count the studies that have empty fields (final_notes is excluded from the count).
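The test for an "empty" field treats whitespace-only strings as empty; for example:
# '^\\s*$' matches strings that are empty or contain only whitespace
grepl('^\\s*$', c(' perceived ', '', '   '))  # FALSE TRUE TRUE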
check_missing <- function(n_empty_fields = 1)
{
  # flag rows where at least n_empty_fields fields are empty or
  # whitespace-only, ignoring the final column (final_notes)
  included_studies[which(
    rowSums(
      sapply(
        included_studies[, -ncol(included_studies)],
        function(x) grepl("^\\s*$", x))) > n_empty_fields - 1), ]
}
## AT LEAST 1 MISSING FIELD
missing_1 <- check_missing()
missing_1 |> dplyr::tibble()
# A tibble: 4 × 28
citekey paradigm notes_ca notes_te emotions emotion_locus stimulus_type
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 panda2020no " class… " inclu… " inclu… " valen… " perceived … " AllMusic "
2 beveridge2018… " regre… " inclu… " inclu… " valen… " perceived" " stimuli co…
3 deng2015em " regre… " inclu… " inclu… " valen… " perceived … " Western cl…
4 gingras2014be " regre… " inclu… " inclu… " pleas… " perceived … " piano trio…
# ℹ 21 more variables: stimulus_genre <chr>, stimulus_duration <chr>,
# stimulus_duration_unit <chr>, stimulus_n <chr>, feature_n <chr>,
# participant_n <chr>, participant_expertise <chr>, participant_origin <chr>,
# participant_sampling <chr>, participant_task <chr>,
# feature_categories <chr>, feature_source <chr>,
# feature_reduction_method <chr>, model_category <chr>, model_detail <chr>,
# model_measure <chr>, model_complexity_parameters <chr>, …
## AT LEAST 2 MISSING FIELDS
check_missing(2) |> dplyr::tibble()
# A tibble: 0 × 28
# ℹ 28 variables: citekey <chr>, paradigm <chr>, notes_ca <chr>,
# notes_te <chr>, emotions <chr>, emotion_locus <chr>, stimulus_type <chr>,
# stimulus_genre <chr>, stimulus_duration <chr>,
# stimulus_duration_unit <chr>, stimulus_n <chr>, feature_n <chr>,
# participant_n <chr>, participant_expertise <chr>, participant_origin <chr>,
# participant_sampling <chr>, participant_task <chr>,
# feature_categories <chr>, feature_source <chr>, …
## AT LEAST 3 MISSING FIELDS
check_missing(3) |> dplyr::tibble()
# A tibble: 0 × 28
# ℹ 28 variables: citekey <chr>, paradigm <chr>, notes_ca <chr>,
# notes_te <chr>, emotions <chr>, emotion_locus <chr>, stimulus_type <chr>,
# stimulus_genre <chr>, stimulus_duration <chr>,
# stimulus_duration_unit <chr>, stimulus_n <chr>, feature_n <chr>,
# participant_n <chr>, participant_expertise <chr>, participant_origin <chr>,
# participant_sampling <chr>, participant_task <chr>,
# feature_categories <chr>, feature_source <chr>, …
Parsing Studies
The following function provides a working example of converting the R code contained in model_rate_emotion_values into a data frame for further analysis. get_study_summaries is a simple wrapper that demonstrates how the preprocessing routines behave in the examples below.
# high level function to apply parse_model_output to multiple studies
get_study_summaries <- function(df) {
do.call(rbind,
lapply(df$model_rate_emotion_values,
FUN = function(x) {
study_id <- unique(df$citekey[which(df$model_rate_emotion_values == x)])
model_results <- parse_model_output(x)
return(cbind(study_id, model_results))
}
)
)
}
Regression studies
Encoded values should use the following nomenclature. In this synthetic example, dim_measure.summary2 exists for the study in the second row but not the first, so bind_field encodes an NA for summary2 in row 1.
bind_field(
  'library.model.features.data.exp1' = c(dim_measure1.summary = 0.5, dim_measure2.summary = 0.1),
  'library.model.features.data.exp2' = c(dim_measure1.summary = 0.2, dim_measure2.summary = 0.5, dim_measure.summary2 = 0.3)
)
|  | dim_measure.summary2 | dim_measure1.summary | dim_measure2.summary |
|---|---|---|---|
| library.model.features.data.exp1 | NA | 0.5 | 0.1 |
| library.model.features.data.exp2 | 0.3 | 0.2 | 0.5 |
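For intuition only, here is a minimal sketch of that name-alignment behavior. The real bind_field is sourced from the R/ scripts at the top of this document; bind_field_sketch below is an illustrative toy, not the actual implementation:
# toy re-implementation: align named vectors on the union of their names,
# filling absent entries with NA (mimics, but does not replace, bind_field)
bind_field_sketch <- function(...) {
  rows <- list(...)
  cols <- sort(unique(unlist(lapply(rows, names))))
  out <- t(sapply(rows, function(r) r[cols]))
  colnames(out) <- cols
  out
}
bind_field_sketch(
  'exp1' = c(a.mean = 0.5, b.mean = 0.1),
  'exp2' = c(a.mean = 0.2, b.mean = 0.5, a.sd = 0.3)
)  # exp1 gets NA for a.sd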
Example 2
This example is more extensive and shows how the procedure works across multiple studies:
regression_example <- data.frame(
  citekey = c('testStudy2023aa', 'testStudy2023bb', 'testStudy2020cc'),
  model_rate_emotion_values = c(
    "bind_field('marsyas.random forest.mfcc.msd.1' = c(valence_r2.mean = 0.9, arousal_r2.mean = 0.5,
                                                       valence_r2.sd = 0.4, arousal_r2.sd = 0.3),
                'essentia.random forest.mfcc.deam.1' = c(valence_r2.mean = 0.9, arousal_r2.mean = 0.5))",
    "bind_field('librosa.linear regression.mfcc.ismir.1' = c(valence_r2.mean = 0.1, arousal_r2.mean = 0.3, euclid_euclid.mean = 0.32))",
    "bind_field('mirtoolbox.neural net.pitch.filmMusic.2' = c(valence_r2.mean = 0.1, arousal_r2.mean = 0.3))")
)
dplyr::tibble(get_study_summaries(regression_example)) |>
  knitr::kable()
Warning in fill_missing_names(...): Mismatch in input lengths.
| study_id | library_id | model_id | feature_id | data_id | experiment_id | dimension | measure | statistic | values |
|---|---|---|---|---|---|---|---|---|---|
| testStudy2023aa | marsyas | random forest | mfcc | msd | 1 | arousal | r2 | mean | 0.50 |
| testStudy2023aa | marsyas | random forest | mfcc | msd | 1 | arousal | r2 | sd | 0.30 |
| testStudy2023aa | marsyas | random forest | mfcc | msd | 1 | valence | r2 | mean | 0.90 |
| testStudy2023aa | marsyas | random forest | mfcc | msd | 1 | valence | r2 | sd | 0.40 |
| testStudy2023aa | essentia | random forest | mfcc | deam | 1 | arousal | r2 | mean | 0.50 |
| testStudy2023aa | essentia | random forest | mfcc | deam | 1 | arousal | r2 | sd | NA |
| testStudy2023aa | essentia | random forest | mfcc | deam | 1 | valence | r2 | mean | 0.90 |
| testStudy2023aa | essentia | random forest | mfcc | deam | 1 | valence | r2 | sd | NA |
| testStudy2023bb | librosa | linear regression | mfcc | ismir | 1 | valence | r2 | mean | 0.10 |
| testStudy2023bb | librosa | linear regression | mfcc | ismir | 1 | arousal | r2 | mean | 0.30 |
| testStudy2023bb | librosa | linear regression | mfcc | ismir | 1 | euclid | euclid | mean | 0.32 |
| testStudy2020cc | mirtoolbox | neural net | pitch | filmMusic | 2 | valence | r2 | mean | 0.10 |
| testStudy2020cc | mirtoolbox | neural net | pitch | filmMusic | 2 | arousal | r2 | mean | 0.30 |
Classification studies
For classification studies, we can extract results from a matrix-like representation.
unflatten is a convenience function for preparing a confusion matrix, and summarize_matrix calls on the caret package to compute summary statistics for the matrix.
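For intuition, a minimal sketch of what this pair of helpers might look like. The real implementations live in the sourced R/ scripts; unflatten_sketch and summarize_matrix_sketch are illustrative names, and the real summarize_matrix reports more statistics than the two shown here:
# toy unflatten: rebuild a square confusion matrix from row-major cells,
# taking class labels from the named leading arguments
unflatten_sketch <- function(...) {
  vals <- c(...)
  n <- sqrt(length(vals))
  labels <- names(vals)[1:n]
  matrix(vals, nrow = n, byrow = TRUE, dimnames = list(labels, labels))
}
# toy summarize: hand the matrix to caret::confusionMatrix for summary stats
summarize_matrix_sketch <- function(m) {
  cm <- caret::confusionMatrix(as.table(m))
  c(classification_accuracy = unname(cm$overall['Accuracy']),
    classification_kappa = unname(cm$overall['Kappa']))
}
summarize_matrix_sketch(unflatten_sketch('Class_A' = 0.8, 'Class_B' = 0.2,
                                         0.2, 0.8))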
bind_field(
  lapply(
    list(
      'library.model.features.data.exp' = unflatten(
        "Class_A" = 0.8, "Class_B" = 0.2,
        0.2, 0.8
      ),
      'library.model.features.data.exp' = unflatten(
        "Class_A" = 0.4, "Class_B" = 0.6,
        0.6, 0.4
      )
    ),
    summarize_matrix
  )
)
|  | classification_accuracy | classification_accuracy.lower | classification_accuracy.null | classification_accuracy.pvalue | classification_accuracy.upper | classification_class.n | classification_kappa | classification_mcc | classification_mcnemar.pvalue |
|---|---|---|---|---|---|---|---|---|---|
| library.model.features.data.exp | 0.8 | NA | NA | NA | NA | 2 | 0.6 | 0.6 | 1 |
| library.model.features.data.exp | 0.4 | NA | NA | NA | NA | 2 | -0.2 | -0.2 | 1 |
classification_example <- data.frame(
  citekey = "classificationStudy2020aa",
  model_rate_emotion_values =
    "bind_field(
       lapply(
         list(
           'marsyas.smo unambiguous.mixed.new.1' = unflatten(
             'A' = 52.6, 'B' = 17.1, 'C' = 0.0,
             12,         65.2,       7.6,
             1.1,        9.9,        73.6),
           'marsyas.smo full.mixed.new.1' = unflatten(
             'C1' = 56.0, 'C2' = 19, 'C3' = 0.0, 'C4' = 2,
             11.3,        58.9,      13.7,       15,
             0,           13.2,      68.6,       3,
             15,          1,         4,          6)
         ),
         summarize_matrix
       )
     )"
)
dplyr::tibble(get_study_summaries(classification_example)) |>
  knitr::kable()
| study_id | library_id | model_id | feature_id | data_id | experiment_id | dimension | measure | statistic | values |
|---|---|---|---|---|---|---|---|---|---|
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | accuracy | NA | 0.8005019 |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | accuracy | lower | NA |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | accuracy | null | NA |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | accuracy | pvalue | NA |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | accuracy | upper | NA |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | class | n | 3.0000000 |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | kappa | NA | 0.6990861 |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | mcc | NA | 0.6998487 |
| classificationStudy2020aa | marsyas | smo unambiguous | mixed | new | 1 | classification | mcnemar | pvalue | 0.5132684 |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | accuracy | NA | 0.6609697 |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | accuracy | lower | NA |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | accuracy | null | NA |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | accuracy | pvalue | NA |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | accuracy | upper | NA |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | class | n | 4.0000000 |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | kappa | NA | 0.5257194 |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | mcc | NA | 0.5260629 |
| classificationStudy2020aa | marsyas | smo full | mixed | new | 1 | classification | mcnemar | pvalue | NaN |