--- title: "Analyzing Similarities in Conversational Sequences across Multiple Dyads" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Analyzing Similarities in Conversational Sequences across Multiple Dyads} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ## Introduction This vignette demonstrates how to use the functions provided in the `conversation_multidyads.R` file to analyze conversations across multiple dyads. These functions allow you to preprocess conversation data and calculate various similarity measures between conversation participants. ## Setup Load the library: ```{r, echo=FALSE, message=FALSE} library(lme4) library(ggplot2) library(topicmodels) library(tm) library(slam) ``` ```{r, message=FALSE} library(conversim) ``` ## Loading the Data We'll use the provided dataset "dyad_example_data.Rdata" located in the inst/extdata directory of the package: ```{r load_data} data_path <- system.file("extdata", "dyad_example_data.Rdata", package = "conversim") load(data_path) # Display the first few rows and structure of the data head(dyad_example_data) str(dyad_example_data) ``` ## Preprocessing Before analyzing the conversations, we need to preprocess the text data: ```{r echo=FALSE} preprocess_dyads <- function(conversations) { conversations$processed_text <- sapply(conversations$text, function(text) { text <- tolower(text) text <- gsub("[[:punct:]]", "", text) text <- gsub("[[:digit:]]", "", text) text <- gsub("\\s+", " ", trimws(text)) return(text) }) # Remove empty processed texts conversations <- conversations[nchar(conversations$processed_text) > 0, ] return(conversations) } ``` ```{r preprocess} processed_convs <- preprocess_dyads(dyad_example_data) head(dyad_example_data) ``` ## Calculating Similarities Now, let's calculate various similarity measures for our preprocessed conversations. ### Topic Similarity ```{r, echo=FALSE} topic_sim_dyads <- function(conversations, method = "lda", num_topics = 2, window_size = 3) { if (!requireNamespace("lme4", quietly = TRUE)) { stop("Package 'lme4' is required for this function. Please install it.") } if (!requireNamespace("topicmodels", quietly = TRUE)) { stop("Package 'topicmodels' is required for this function. Please install it.") } if (!requireNamespace("tm", quietly = TRUE)) { stop("Package 'tm' is required for this function. Please install it.") } if (!requireNamespace("slam", quietly = TRUE)) { stop("Package 'slam' is required for this function. Please install it.") } dyads <- unique(conversations$dyad_id) all_similarities <- list() for (dyad in dyads) { dyad_conv <- conversations[conversations$dyad_id == dyad, ] similarities <- c() for (i in 1:(nrow(dyad_conv) - window_size + 1)) { window <- dyad_conv$processed_text[i:(i+window_size-1)] # Create a document-term matrix corpus <- tm::Corpus(tm::VectorSource(window)) dtm <- tm::DocumentTermMatrix(corpus) # Check if the DTM is empty or has any empty documents using slam if (sum(slam::col_sums(dtm) > 0) == 0) { similarities <- c(similarities, NA) next } # Remove empty documents using slam dtm <- dtm[slam::row_sums(dtm) > 0, ] if (method == "lda") { tryCatch({ lda_model <- topicmodels::LDA(dtm, k = num_topics, control = list(seed = 1234)) topics <- topicmodels::topics(lda_model) sim <- sum(topics[1:(window_size/2)] == topics[(window_size/2+1):window_size]) / (window_size/2) }, error = function(e) { sim <- NA }) } else { stop("Unsupported method. 
### Lexical Similarity

```{r, echo=FALSE}
lexical_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + window_size/2 - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + window_size/2):(i + window_size - 1)], collapse = " ")

      sim <- conversim::lexical_similarity(window1, window2)
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)

  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```

```{r lexical_similarity, message=FALSE}
lexical_sim <- lexical_sim_dyads(processed_convs, window_size = 3)
```
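Each window pair above is scored with `conversim::lexical_similarity()`. The same function can be called directly on any two texts; the sentences below are made up purely for illustration:

```{r lexical_single_pair}
# Illustrative pair of utterances, scored the same way as each window pair above
conversim::lexical_similarity("the cat sat on the warm mat",
                              "the cat lay on the warm rug")
```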
### Semantic Similarity

```{r, echo=FALSE}
semantic_similarity <- function(conversation1, conversation2, method = "tfidf",
                                model_path = NULL, dim = 100, window = 5, iter = 5) {

  # Internal function to calculate cosine similarity
  cosine_similarity <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) return(0)
    sim <- sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
    # Ensure the result is between 0 and 1
    return((sim + 1) / 2)
  }

  # Internal function to load pre-trained GloVe embeddings
  load_glove <- function(file_path) {
    tryCatch({
      conn <- file(file_path, "r")
      lines <- readLines(conn)
      close(conn)
      split_lines <- strsplit(lines, " ")
      words <- sapply(split_lines, `[`, 1)
      vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1])))
      rownames(vectors) <- words
      return(vectors)
    }, error = function(e) {
      stop(paste("Error loading GloVe file:", e$message))
    })
  }

  # Internal function to calculate sentence embedding
  sentence_embedding <- function(sentence, word_vectors) {
    tokens <- unlist(strsplit(sentence, "\\s+"))
    valid_tokens <- tokens[tokens %in% rownames(word_vectors)]
    if (length(valid_tokens) == 0) {
      return(rep(0, ncol(word_vectors)))
    }
    embeddings <- word_vectors[valid_tokens, , drop = FALSE]
    if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors)))
    return(colMeans(embeddings))
  }

  if (method == "tfidf") {
    # TF-IDF approach
    corpus <- c(conversation1, conversation2)
    dtm <- tm::DocumentTermMatrix(tm::Corpus(tm::VectorSource(corpus)))
    tfidf <- tm::weightTfIdf(dtm)
    m <- as.matrix(tfidf)

    # Issue a warning for short conversations or little vocabulary overlap
    if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) {
      warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.")
    }

    # If the conversations are identical, return 1
    if (identical(conversation1, conversation2)) {
      return(1)
    }

    # Ensure we have at least one term in common
    if (ncol(m) == 0) {
      return(0)
    }

    # Calculate cosine similarity
    similarity <- cosine_similarity(m[1, ], m[2, ])
  } else if (method == "word2vec" || method == "glove") {
    # Word2Vec or GloVe approach
    if (method == "word2vec") {
      if (!requireNamespace("word2vec", quietly = TRUE)) {
        stop("Package 'word2vec' is required for this method. Please install it.")
      }
      # Train a Word2Vec model on the two conversations
      all_text <- c(conversation1, conversation2)
      model <- word2vec::word2vec(x = all_text, dim = dim, iter = iter,
                                  window = window, min_count = 1)
      word_vectors <- as.matrix(model)
    } else { # method == "glove"
      if (is.null(model_path)) {
        stop("Please provide a path to the pre-trained GloVe file.")
      }
      # Load pre-trained GloVe vectors
      word_vectors <- load_glove(model_path)
    }

    # Calculate embeddings for each conversation
    embedding1 <- sentence_embedding(conversation1, word_vectors)
    embedding2 <- sentence_embedding(conversation2, word_vectors)

    # Calculate cosine similarity
    similarity <- cosine_similarity(embedding1, embedding2)
  } else {
    stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.")
  }

  return(similarity)
}

semantic_sim_dyads <- function(conversations, method = "tfidf", window_size = 3, ...) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + window_size/2 - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + window_size/2):(i + window_size - 1)], collapse = " ")

      sim <- semantic_similarity(window1, window2, method, ...)
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)

  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```

```{r semantic_similarity, message=FALSE, warning=FALSE, results='hide'}
semantic_sim <- semantic_sim_dyads(processed_convs, method = "tfidf", window_size = 3)
```
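Only the `"tfidf"` method is run in this vignette. The embedding-based alternatives go through the same wrapper; the sketch below shows how they would be called (not evaluated here, and the GloVe path is a placeholder for a file you would need to download separately):

```{r semantic_alternatives, eval=FALSE}
# Embedding-based alternatives (not run):
# 'word2vec' trains a small model on the window texts themselves,
# 'glove' requires a path to a pre-trained GloVe text file
semantic_sim_w2v   <- semantic_sim_dyads(processed_convs, method = "word2vec", window_size = 3)
semantic_sim_glove <- semantic_sim_dyads(processed_convs, method = "glove", window_size = 3,
                                         model_path = "path/to/glove.6B.100d.txt")
```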
### Structural Similarity

```{r, echo=FALSE}
structural_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]

    # Length similarity component (constant within a single conversation)
    length_sim <- 1

    turn_lengths <- nchar(dyad_conv$processed_text)
    turn_length_sim <- 1 - sd(turn_lengths) / mean(turn_lengths)

    speaker_changes <- sum(dyad_conv$speaker[-1] != dyad_conv$speaker[-nrow(dyad_conv)])
    speaker_change_sim <- 1 - abs(speaker_changes - (nrow(dyad_conv) / 2)) / (nrow(dyad_conv) / 2)

    similarity <- mean(c(length_sim, turn_length_sim, speaker_change_sim))
    all_similarities[[as.character(dyad)]] <- similarity
  }

  # Calculate overall average using simple mean
  overall_average <- mean(unlist(all_similarities))

  # Print warning about not using multilevel modeling
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```

```{r structural_similarity, message=FALSE, warning=FALSE}
structural_sim <- structural_sim_dyads(processed_convs)
```

### Stylistic Similarity

```{r, echo=FALSE}
stylistic_similarity <- function(text1, text2) {
  # Helper function to calculate features for a single text
  calculate_features <- function(text) {
    words <- strsplit(text, " ")[[1]]
    sentences <- strsplit(text, "\\. ")[[1]]

    ttr <- length(unique(words)) / length(words)
    avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]])))
    syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w)))))
    fk_grade <- 0.39 * (length(words) / length(sentences)) + 11.8 * (syllables / length(words)) - 15.59

    c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade)
  }

  features1 <- calculate_features(text1)
  features2 <- calculate_features(text2)

  feature_diff <- abs(features1 - features2)
  overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2))

  normalized1 <- (features1 - mean(features1)) / sd(features1)
  normalized2 <- (features2 - mean(features2)) / sd(features2)
  cosine_similarity <- sum(normalized1 * normalized2) / (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2)))

  list(
    text1_features = features1,
    text2_features = features2,
    feature_differences = feature_diff,
    overall_similarity = overall_similarity,
    cosine_similarity = cosine_similarity
  )
}

stylistic_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + window_size/2 - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + window_size/2):(i + window_size - 1)], collapse = " ")

      sim <- stylistic_similarity(window1, window2)$overall_similarity
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)

  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```

```{r stylistic_similarity, message=FALSE}
stylistic_sim <- stylistic_sim_dyads(processed_convs, window_size = 3)
```
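`stylistic_similarity()` compares three features per text: type-token ratio, average sentence length, and an approximate Flesch-Kincaid grade level. Calling it directly on two made-up turns (not from the dataset) shows the feature-level output alongside the combined score:

```{r stylistic_single_pair}
# Illustrative pair of turns (not from the dataset)
sty <- stylistic_similarity("We should plan the trip carefully. It will take time.",
                            "Planning the trip carefully will take some time.")
sty$text1_features
sty$overall_similarity
```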
")[[1]] ttr <- length(unique(words)) / length(words) avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]]))) syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w))))) fk_grade <- 0.39 * (length(words) / length(sentences)) + 11.8 * (syllables / length(words)) - 15.59 c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade) } features1 <- calculate_features(text1) features2 <- calculate_features(text2) feature_diff <- abs(features1 - features2) overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2)) normalized1 <- (features1 - mean(features1)) / sd(features1) normalized2 <- (features2 - mean(features2)) / sd(features2) cosine_similarity <- sum(normalized1 * normalized2) / (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2))) list( text1_features = features1, text2_features = features2, feature_differences = feature_diff, overall_similarity = overall_similarity, cosine_similarity = cosine_similarity ) } stylistic_sim_dyads <- function(conversations, window_size = 3) { dyads <- unique(conversations$dyad_id) all_similarities <- list() for (dyad in dyads) { dyad_conv <- conversations[conversations$dyad_id == dyad, ] similarities <- c() for (i in 1:(nrow(dyad_conv) - window_size + 1)) { window1 <- paste(dyad_conv$processed_text[i:(i+window_size/2-1)], collapse = " ") window2 <- paste(dyad_conv$processed_text[(i+window_size/2):(i+window_size-1)], collapse = " ") sim <- stylistic_similarity(window1, window2)$overall_similarity similarities <- c(similarities, sim) } all_similarities[[as.character(dyad)]] <- similarities } # Prepare data for multilevel modeling model_data <- data.frame( dyad_id = rep(dyads, sapply(all_similarities, length)), similarity = unlist(all_similarities) ) # Fit multilevel model model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data) # Extract overall average similarity accounting for dyad-level variation overall_average <- fixef(model)[1] return(list(similarities_by_dyad = all_similarities, overall_average = overall_average)) } ``` ```{r stylistic_similarity, message=FALSE} stylistic_sim <- stylistic_sim_dyads(processed_convs, window_size = 3) ``` ### Sentiment Similarity ```{r, echo=FALSE} sentiment_sim_dyads <- function(conversations, window_size = 3) { dyads <- unique(conversations$dyad_id) all_similarities <- list() for (dyad in dyads) { dyad_conv <- conversations[conversations$dyad_id == dyad, ] similarities <- c() for (i in 1:(nrow(dyad_conv) - window_size + 1)) { window1 <- paste(dyad_conv$processed_text[i:(i+window_size/2-1)], collapse = " ") window2 <- paste(dyad_conv$processed_text[(i+window_size/2):(i+window_size-1)], collapse = " ") sim <- conversim::sentiment_similarity(window1, window2) similarities <- c(similarities, sim) } all_similarities[[as.character(dyad)]] <- similarities } # Prepare data for multilevel modeling model_data <- data.frame( dyad_id = rep(dyads, sapply(all_similarities, length)), similarity = unlist(all_similarities) ) # Fit multilevel model model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data) # Extract overall average similarity accounting for dyad-level variation overall_average <- fixef(model)[1] return(list(similarities_by_dyad = all_similarities, overall_average = overall_average)) } ``` ```{r sentiment_similarity, message=FALSE} sentiment_sim <- sentiment_sim_dyads(processed_convs, window_size = 3) ``` ### Participant Similarity ```{r, echo=FALSE} participant_sim_dyads <- function(conversations) { dyads <- 
### Timing Similarity

```{r, echo=FALSE}
timing_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]

    turn_lengths <- nchar(dyad_conv$processed_text)
    length_sim <- 1 - stats::sd(turn_lengths) / mean(turn_lengths)

    # Calculate rhythm similarity based on turn length differences
    rhythm_diffs <- diff(turn_lengths)
    rhythm_sim <- 1 - stats::sd(rhythm_diffs) / mean(abs(rhythm_diffs))

    similarity <- mean(c(length_sim, rhythm_sim))
    all_similarities[[as.character(dyad)]] <- similarity
  }

  # Calculate overall average using simple mean
  overall_average <- mean(unlist(all_similarities))

  # Print warning about not using multilevel modeling
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```

```{r timing_similarity, message=FALSE, warning=FALSE}
timing_sim <- timing_sim_dyads(processed_convs)
```
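Like the structural and participant measures, the timing measure produces a single score per dyad; the per-dyad scores and their simple mean can be pulled straight from the returned list:

```{r timing_inspect}
# One timing score per dyad, plus the simple-mean overall average
unlist(timing_sim$similarities_by_dyad)
timing_sim$overall_average
```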
## Visualization

Let's visualize the results of our similarity analyses using ggplot2. Here's an example of how to plot the topic similarity for each dyad:

```{r visualization, fig.show='hide'}
topic_sim_df <- data.frame(
  dyad = rep(names(topic_sim$similarities_by_dyad), sapply(topic_sim$similarities_by_dyad, length)),
  similarity = unlist(topic_sim$similarities_by_dyad),
  index = unlist(lapply(topic_sim$similarities_by_dyad, seq_along))
)

ggplot(topic_sim_df, aes(x = index, y = similarity, color = dyad)) +
  geom_line() +
  geom_point() +
  facet_wrap(~dyad, ncol = 2) +
  labs(title = "Topic Similarity Across Dyads",
       x = "Conversation Sequence",
       y = "Similarity Score") +
  theme_minimal() +
  theme(legend.position = "none")
```

```{r echo=FALSE, out.height='2000px', out.width='800px'}
knitr::include_graphics("../man/figures/dyadconv_plot.jpeg")
```

## Comparing Different Similarity Measures

We can also compare different similarity measures across dyads:

```{r comparison, fig.width=10, fig.height=6}
comparison_df <- data.frame(
  dyad = names(topic_sim$similarities_by_dyad),
  topic = sapply(topic_sim$similarities_by_dyad, mean),
  lexical = sapply(lexical_sim$similarities_by_dyad, mean),
  semantic = sapply(semantic_sim$similarities_by_dyad, mean),
  structural = unlist(structural_sim$similarities_by_dyad),
  stylistic = sapply(stylistic_sim$similarities_by_dyad, mean),
  sentiment = sapply(sentiment_sim$similarities_by_dyad, mean),
  participant = unlist(participant_sim$similarities_by_dyad),
  timing = unlist(timing_sim$similarities_by_dyad)
)

comparison_long <- reshape(comparison_df,
                           varying = list(names(comparison_df)[names(comparison_df) != "dyad"]),
                           v.names = "similarity",
                           timevar = "measure",
                           times = names(comparison_df)[names(comparison_df) != "dyad"],
                           new.row.names = 1:10000, # Adjust this if needed
                           direction = "long")

ggplot(comparison_long, aes(x = measure, y = similarity, fill = measure)) +
  geom_boxplot() +
  labs(title = "Comparison of Similarity Measures Across Dyads",
       x = "Similarity Measure",
       y = "Similarity Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

## Conclusion

This vignette demonstrates how to use the functions in `conversation_multidyads.R` to analyze conversations across multiple dyads using the package's example dataset. These tools allow researchers to examine various aspects of conversation dynamics, including topic coherence, lexical alignment, semantic similarity, and more.

The visualizations show how different similarity measures vary across dyads and how they compare with one another, which can help in identifying patterns or trends in conversational dynamics.

Remember that the usefulness of these analyses depends on the size and nature of your dataset. Always consider the context of your conversations and the limitations of each similarity measure when interpreting the results.

For further analysis, you might consider:

1. Investigating dyads with particularly high or low similarity scores.
2. Examining how similarity measures change over the course of conversations.
3. Correlating similarity measures with other variables of interest (e.g., conversation outcomes, participant characteristics); a starting point is sketched below.
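As a minimal sketch for the third point, one could start from the dyad-level means collected in `comparison_df` above (not evaluated here; with only a handful of dyads these correlations are unstable and purely illustrative, and any outcome variable would have to come from your own data):

```{r further_analysis_sketch, eval=FALSE}
# Pairwise correlations between the dyad-level similarity measures
measure_cols <- names(comparison_df)[names(comparison_df) != "dyad"]
round(cor(comparison_df[, measure_cols]), 2)

# These measure columns could also be merged with dyad-level covariates
# (e.g., a hypothetical 'outcome' score per dyad) and modeled directly.
```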