diff --git a/NEWS.md b/NEWS.md index d86e48ff..d1b910c5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ * Optimized gedcom reader, com2links for speed and memory usage, with a focus on large pedigrees * Fixed bug in gedcom reader that resulted in document records being added to the final person in the pedigree * Added more unit tests for gedcom reader and data parser +* several improvements to the GEDCOM parsing functionality, focusing on more robust and flexible event parsing, better support for different GEDCOM versions, and enhanced usability. * Optimized sliceFamilies to be more abstract, and no longer require mtdna * Created `.require_openmx()` to make it easier to use OpenMx functions without making OpenMx a dependency * Smarter string ID handling for ped2id diff --git a/R/helpReadGedcom.R b/R/helpReadGedcom.R index ace867a1..ac6efd49 100644 --- a/R/helpReadGedcom.R +++ b/R/helpReadGedcom.R @@ -18,7 +18,7 @@ initializeRecord <- function(all_var_names) { #' @param df_temp A data frame containing the columns to be combined. #' @return A data frame with the combined columns. collapseNames <- function(verbose, df_temp) { - if (verbose == TRUE) message("Combining Duplicate Columns") + if (verbose == TRUE) message("Combining Duplicate Name Columns...") if (!all(is.na(df_temp$name_given_pieces)) || !all(is.na(df_temp$name_given))) { result <- combineColumns(df_temp$name_given, df_temp$name_given_pieces) @@ -34,6 +34,71 @@ collapseNames <- function(verbose, df_temp) { df_temp } +#' Detect GEDCOM Version from File Lines +#' +#' @param lines Character vector of lines from a GEDCOM file. +#' @return A string such as `"5.5.1"`, `"7.0"`, or `"unknown"`. +#' @keywords internal +detectGedcomVersion <- function(lines) { + head_idx <- which(grepl("^0 HEAD\\b", lines))[1L] + if (is.na(head_idx)) return("unknown") + + # End of HEAD is the next level-0 record + if (head_idx >= length(lines)) return("unknown") + next_l0 <- which(grepl("^0 ", lines[(head_idx + 1L):length(lines)]))[1L] + head_end <- if (is.na(next_l0)) length(lines) else head_idx + next_l0 - 1L + head_block <- lines[head_idx:head_end] + + gedc_idx <- which(grepl("^1 GEDC\\b", head_block))[1L] + if (is.na(gedc_idx)) return("unknown") + + # Guard: if GEDC is the last line of HEAD, there is no VERS to look ahead to + if (gedc_idx >= length(head_block)) return("unknown") + + # Look ahead within HEAD block for the VERS line under GEDC + lookahead <- head_block[seq(gedc_idx + 1L, min(gedc_idx + 5L, length(head_block)))] + vers_line <- lookahead[grepl("^2 VERS\\b", lookahead)][1L] + if (is.na(vers_line)) return("unknown") + + val <- extractInfo(vers_line, "VERS") + if (is.na(val) || !nzchar(val)) return("unknown") + val +} + +#' Convert GEDCOM Latitude String to Numeric +#' +#' Converts GEDCOM-style latitude strings like `"N51.5074"` or `"S33.8688"` to +#' signed decimal degrees. Returns `NA` for `NA` or unrecognised-prefix input. +#' +#' @param x Character vector of GEDCOM latitude values. +#' @return Numeric vector of decimal degrees (positive = N, negative = S). +#' @examples +#' BGmisc:::gedcomLatToNumeric(c("N51.5074", "S33.8688", NA)) +#' @keywords internal +gedcomLatToNumeric <- function(x) { + out <- rep(NA_real_, length(x)) + ok <- !is.na(x) & (startsWith(x, "N") | startsWith(x, "S")) + out[ok] <- as.numeric(substring(x[ok], 2)) * ifelse(startsWith(x[ok], "N"), 1, -1) + out +} + +#' Convert GEDCOM Longitude String to Numeric +#' +#' Converts GEDCOM-style longitude strings like `"E151.2093"` or `"W0.1278"` to +#' signed decimal degrees. Returns `NA` for `NA` or unrecognised-prefix input. +#' +#' @param x Character vector of GEDCOM longitude values. +#' @return Numeric vector of decimal degrees (positive = E, negative = W). +#' @examples +#' BGmisc:::gedcomLonToNumeric(c("E151.2093", "W0.1278", NA)) +#' @keywords internal +gedcomLonToNumeric <- function(x) { + out <- rep(NA_real_, length(x)) + ok <- !is.na(x) & (startsWith(x, "E") | startsWith(x, "W")) + out[ok] <- as.numeric(substring(x[ok], 2)) * ifelse(startsWith(x[ok], "E"), 1, -1) + out +} + #' Combine Columns #' #' This function combines two columns, handling conflicts and merging non-conflicting data. diff --git a/R/readGedcom.R b/R/readGedcom.R index a9976f76..6f4a8c0f 100644 --- a/R/readGedcom.R +++ b/R/readGedcom.R @@ -20,12 +20,14 @@ #' including name prefix, name suffix, nickname, and married surname. #' #' Birth and death events are recognized from `BIRT` and `DEAT` tags. Event -#' details are currently parsed using fixed offsets within the individual block. -#' For birth events, the parser expects `DATE` at `i + 1`, `PLAC` at `i + 2`, -#' `LATI` at `i + 4`, and `LONG` at `i + 5`. For death events, the parser -#' expects `DATE` at `i + 1`, `PLAC` at `i + 2`, `CAUS` at `i + 3`, `LATI` at -#' `i + 4`, and `LONG` at `i + 5`. Missing elements leave the corresponding -#' output fields as `NA`. +#' details are parsed by collecting all child lines whose GEDCOM level equals +#' the event level plus one (direct children), then looking up sub-fields by +#' tag name. `DATE`, `PLAC`, and `CAUS` are matched as direct children of the +#' event. Coordinates (`LATI` and `LONG`) are searched across all descendant +#' lines, which allows them to be located whether they appear as direct children +#' (common in some GEDCOM 5.5.x exporters), under `PLAC` (standard GEDCOM +#' 5.5.1), or under a `MAP` substructure under `PLAC` (GEDCOM 7.x). Missing +#' sub-fields leave the corresponding output columns as `NA`. #' #' Attribute tags such as `OCCU`, `EDUC`, `RELI`, `CAST`, `NCHI`, `NMR`, `NATI`, #' `RESI`, `PROP`, `SSN`, `TITL`, `DSCR`, and `IDNO` are parsed directly into @@ -45,20 +47,22 @@ #' #' @param file_path Character string. Path to the GEDCOM file. #' @param verbose Logical. If `TRUE`, print progress messages. -#' @param add_parents Logical. If `TRUE`, infer `momID` and `dadID` from `FAMC` -#' and `FAMS` mappings during post-processing. #' @param remove_empty_cols Logical. If `TRUE`, drop columns that are entirely #' `NA` during post-processing. -#' @param combine_cols Logical. If `TRUE`, combine redundant name columns, such -#' as `name_given` with `name_given_pieces` and `name_surn` with -#' `name_surn_pieces`, when their values do not conflict. -#' @param parse_dates Logical. If `TRUE`, attempt to parse date columns (e.g., `birth_date`, `death_date`) into Date objects, after removing common GEDCOM date qualifiers like "ABT", "BEF", and "AFT". #' @param skinny Logical. If `TRUE`, return a slimmer data frame by dropping #' `FAMC`, `FAMS`, and columns that are entirely `NA` during post-processing. #' @param update_rate Numeric. Intended rate at which progress messages should #' be printed. Currently unused. #' @param post_process Logical. If `TRUE`, apply post-processing steps controlled #' by `add_parents`, `combine_cols`, `remove_empty_cols`, `skinny`, and `parse_dates`. +#' @param remove_empty_cols Logical indicating whether to remove columns that are entirely missing. +#' @param combine_cols Logical. If `TRUE`, combine redundant name columns, such +#' as `name_given` with `name_given_pieces` and `name_surn` with +#' `name_surn_pieces`, when their values do not conflict. +#' @param add_parents Logical. If `TRUE`, infer `momID` and `dadID` from `FAMC` +#' and `FAMS` mappings during post-processing. +#' @param parse_dates Logical. If `TRUE`, attempt to parse date columns (e.g., `birth_date`, `death_date`) into Date objects, after removing common GEDCOM date qualifiers like "ABT", "BEF", and "AFT". +#' @param clean_names Logical indicating whether to clean name columns by removing trailing slashes and squishing whitespace. #' @param ... Additional arguments. Currently unused. #' @return A data frame containing information about individuals, with the following potential columns: #' \describe{ @@ -107,13 +111,15 @@ #' readGedcom <- function(file_path, verbose = FALSE, + post_process = TRUE, add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, skinny = FALSE, parse_dates = FALSE, + clean_names = TRUE, update_rate = 1000, - post_process = TRUE, + ...) { # Ensure the file exists and read all lines. if (!file.exists(file_path)) { @@ -121,6 +127,8 @@ readGedcom <- function(file_path, } if (verbose == TRUE) message("Reading file: ", file_path) lines <- readLines(file_path) + gedcom_version <- detectGedcomVersion(lines) + if (verbose) message("Detected GEDCOM version: ", gedcom_version) total_lines <- length(lines) if (verbose == TRUE) message("File is ", total_lines, " lines long") @@ -163,6 +171,7 @@ readGedcom <- function(file_path, records <- Filter(Negate(is.null), records) if (length(records) == 0) { + # Returns NULL without a gedcom_version attribute; callers should check is.null() first. warning("No people found in file") return(NULL) } @@ -183,11 +192,13 @@ readGedcom <- function(file_path, combine_cols = combine_cols, parse_dates = parse_dates, add_parents = add_parents, + clean_names = clean_names, skinny = skinny, verbose = verbose ) } + attr(df_temp, "gedcom_version") <- gedcom_version df_temp } @@ -363,31 +374,75 @@ parseNameLine <- function(line, record) { record } +extractGedcomLevel <- function(line) { + as.integer(stringr::str_extract(line, "^\\d+")) +} + +extractEventSubBlock <- function(block, start_idx) { + event_level <- extractGedcomLevel(block[start_idx]) + n <- length(block) + # start_idx is always within bounds because it comes from a bounded loop in the caller, + # but guard defensively to avoid the descending-sequence pitfall of R's : operator. + if (start_idx >= n) return(character(0)) + end_idx <- start_idx + for (j in (start_idx + 1L):n) { + lvl <- extractGedcomLevel(block[j]) + if (is.na(lvl)) next + if (lvl <= event_level) break + end_idx <- j + } + if (end_idx == start_idx) return(character(0)) + block[(start_idx + 1L):end_idx] +} + +extractInfoFromLines <- function(lines, tag) { + pattern <- paste0("\\b", tag, "\\b") + matches <- lines[grepl(pattern, lines)] + if (length(matches) == 0L) return(NA_character_) + extractInfo(matches[1L], tag) +} + +extractCoordFromSubBlock <- function(sub_block, tag) { + # Searches all levels of the sub-block so it handles: + # GEDCOM 5.5.x: LATI/LONG as direct children of the event + # GEDCOM 5.5.x standard: LATI/LONG under PLAC (level+2) + # GEDCOM 7.x: LATI/LONG under MAP under PLAC (level+3) + pattern <- paste0("\\b", tag, "\\b") + matches <- sub_block[grepl(pattern, sub_block)] + if (length(matches) == 0L) return(NA_character_) + extractInfo(matches[1L], tag) +} + #' Process Event Lines (Birth or Death) #' #' @description Extracts event details (e.g., date, place, cause, latitude, longitude) from a block of GEDCOM lines. -#' For "birth": expect DATE on line i+1, PLAC on i+2, LATI on i+4, LONG on i+5. -#' For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. +#' Uses level-aware sub-block parsing so fields are looked up by tag name rather than fixed offsets. #' @param event A character string indicating the event type ("birth" or "death"). #' @param block A character vector of GEDCOM lines. #' @param i The current line index where the event tag is found. #' @param record A named list representing the individual's record. #' @param pattern_rows A list with counts of GEDCOM tag occurrences. -#' @return The updated record with parsed event information.# -# For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. +#' @return The updated record with parsed event information. processEventLine <- function(event, block, i, record, pattern_rows) { - n_lines <- length(block) + sub_block <- extractEventSubBlock(block, i) + if (length(sub_block) == 0L) return(record) + + event_level <- extractGedcomLevel(block[i]) + direct_children <- sub_block[ + vapply(sub_block, extractGedcomLevel, integer(1L)) == event_level + 1L + ] + if (event == "birth") { - if (i + 1 <= n_lines) record$birth_date <- extractInfo(block[i + 1], "DATE") - if (i + 2 <= n_lines) record$birth_place <- extractInfo(block[i + 2], "PLAC") - if (i + 4 <= n_lines) record$birth_lat <- extractInfo(block[i + 4], "LATI") - if (i + 5 <= n_lines) record$birth_long <- extractInfo(block[i + 5], "LONG") + record$birth_date <- extractInfoFromLines(direct_children, "DATE") + record$birth_place <- extractInfoFromLines(direct_children, "PLAC") + record$birth_lat <- extractCoordFromSubBlock(sub_block, "LATI") + record$birth_long <- extractCoordFromSubBlock(sub_block, "LONG") } else if (event == "death") { - if (i + 1 <= n_lines) record$death_date <- extractInfo(block[i + 1], "DATE") - if (i + 2 <= n_lines) record$death_place <- extractInfo(block[i + 2], "PLAC") - if (i + 3 <= n_lines) record$death_caus <- extractInfo(block[i + 3], "CAUS") - if (i + 4 <= n_lines) record$death_lat <- extractInfo(block[i + 4], "LATI") - if (i + 5 <= n_lines) record$death_long <- extractInfo(block[i + 5], "LONG") + record$death_date <- extractInfoFromLines(direct_children, "DATE") + record$death_place <- extractInfoFromLines(direct_children, "PLAC") + record$death_caus <- extractInfoFromLines(direct_children, "CAUS") + record$death_lat <- extractCoordFromSubBlock(sub_block, "LATI") + record$death_long <- extractCoordFromSubBlock(sub_block, "LONG") } record } @@ -410,8 +465,11 @@ processEventLine <- function(event, block, i, record, pattern_rows) { applyTagMappings <- function(line, record, pattern_rows, tag_mappings) { for (mapping in tag_mappings) { extractor <- if (is.null(mapping$extractor)) NULL else mapping$extractor - result <- processTag(mapping$tag, mapping$field, pattern_rows, line, record, - extractor = extractor, mode = mapping$mode + result <- processTag(mapping$tag, + mapping$field, + pattern_rows, line, record, + extractor = extractor, + mode = mapping$mode ) record <- result$vars if (result$matched) { @@ -539,20 +597,16 @@ processTag <- function(tag, #' #' @description This function optionally adds parent information, combines duplicate columns, #' and removes empty columns from the GEDCOM data frame. It is called by \code{readGedcom()} if \code{post_process = TRUE}. -#' +#' @inheritParams readGedcom #' @param df_temp A data frame produced by \code{readGedcom()}. -#' @param remove_empty_cols Logical indicating whether to remove columns that are entirely missing. -#' @param combine_cols Logical indicating whether to combine columns with duplicate values. -#' @param add_parents Logical indicating whether to add parent information. -#' @param parse_dates Logical indicating whether to parse date columns into Date objects. -#' @param skinny Logical indicating whether to slim down the data frame. #' @param verbose Logical indicating whether to print progress messages. #' @return The post-processed data frame. postProcessGedcom <- function(df_temp, remove_empty_cols = TRUE, combine_cols = TRUE, - parse_dates = FALSE, add_parents = TRUE, + parse_dates = FALSE, + clean_names = TRUE, skinny = TRUE, verbose = FALSE) { if (add_parents == TRUE) { @@ -568,27 +622,52 @@ postProcessGedcom <- function(df_temp, } if (parse_dates == TRUE) { date_cols <- c("birth_date", "death_date") - if (verbose == TRUE) message("Parsing date columns: ", paste(date_cols[date_cols %in% colnames(df_temp)], collapse = ", ")) - # GEDCOM date qualifiers like "ABT", "BEF", "AFT" can be present in date strings. We can remove them before parsing. + calendar_escape_regex <- "@#D[A-Z ]+@\\s*" date_qualifier_regex <- "\\b(?:[aA][bBfF][tT]|[bB][eE][tTfF])\\.?\\b\\s*" - if (verbose == TRUE && any(sapply(df_temp[date_cols], function(col) any(grepl(date_qualifier_regex, col, perl = TRUE)))) - ) { - message("Found date qualifiers in date columns. They will be removed before parsing.") + if (verbose == TRUE) { + message("Parsing date columns: ", paste(date_cols[date_cols %in% colnames(df_temp)], collapse = ", ")) + } + + if (verbose == TRUE && any(date_cols %in% colnames(df_temp))) { + has_qualifiers <- any(sapply( + df_temp[date_cols[date_cols %in% colnames(df_temp)]], + function(col) any(grepl(date_qualifier_regex, col, perl = TRUE)) + )) + if (has_qualifiers==TRUE) { + message("Found date qualifiers in date columns. They will be removed before parsing.") + } } # only parse date columns that are present in the data frame - if (any(date_cols %in% colnames(df_temp))) { - df_temp[date_cols] <- lapply(df_temp[date_cols], function(x) { + present_date_cols <- date_cols[date_cols %in% colnames(df_temp)] + if (length(present_date_cols) > 0) { + df_temp[present_date_cols] <- lapply(df_temp[present_date_cols], function(x) { if (is.character(x)) { + x <- stringr::str_replace_all(x, calendar_escape_regex, "") x <- stringr::str_replace_all(x, date_qualifier_regex, "") - as.Date(x, format = "%d %b %Y") + as.Date(stringr::str_trim(x), format = "%d %b %Y") } else { x } }) } } + if (clean_names == TRUE) { + if (verbose == TRUE) message("Cleaning column names") + name_cols <- grep("^name", colnames(df_temp), value = TRUE) + if (verbose == TRUE && any(name_cols %in% colnames(df_temp))) { + message("Cleaning name columns: ", paste(name_cols, collapse = ", ")) + } + df_temp[name_cols] <- lapply(df_temp[name_cols], function(x) { + if (is.character(x)) { # remove / at end of names if present, and squish whitespace + stringr::str_squish(stringr::str_replace(x,"/+$", "")) + } else { + x + } + } + ) + } if (skinny == TRUE) { if (verbose == TRUE) message("Slimming down the data frame") # Remove raw family relationship columns diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 00000000..3101c76d Binary files /dev/null and b/R/sysdata.rda differ diff --git a/_pkgdown.yml b/_pkgdown.yml index fbd085d6..5c962523 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,3 +1,16 @@ url: https://r-computing-lab.github.io/BGmisc template: bootstrap: 5 + +reference: + - title: GEDCOM I/O + contents: + - readGedcom + - readGed + - readgedcom + - gedcomLatToNumeric + - gedcomLonToNumeric + + - title: Other functions + contents: + - matches(".*") diff --git a/data-raw/df_royal92.R b/data-raw/df_royal92.R index f0f55a4f..008f65a5 100644 --- a/data-raw/df_royal92.R +++ b/data-raw/df_royal92.R @@ -3485,8 +3485,8 @@ if (FALSE) { code_male = "M", code_female = "F", add_phantoms = TRUE, - ped_packed=T, - ped_align = T + ped_packed=TRUE, + ped_align = TRUE ), tooltip_columns = c("personID", "name", "title", "birth_date", "death_date") diff --git a/data-raw/gedcom_spec.R b/data-raw/gedcom_spec.R new file mode 100644 index 00000000..1276903f --- /dev/null +++ b/data-raw/gedcom_spec.R @@ -0,0 +1,25 @@ +# Downloads the FamilySearch GEDCOM 7 spec TSVs and saves them as sysdata. +# Run once with: source("data-raw/gedcom_spec.R") + +base_url <- "https://raw.githubusercontent.com/FamilySearch/GEDCOM/main/extracted-files/" + +gedcom_substructures <- utils::read.table( + paste0(base_url, "substructures.tsv"), + sep = "\t", header = TRUE, stringsAsFactors = FALSE, quote = "" +) +gedcom_payloads <- utils::read.table( + paste0(base_url, "payloads.tsv"), + sep = "\t", header = TRUE, stringsAsFactors = FALSE, quote = "" +) +gedcom_enumerations <- utils::read.table( + paste0(base_url, "enumerations.tsv"), + sep = "\t", header = TRUE, stringsAsFactors = FALSE, quote = "" +) + +usethis::use_data( + gedcom_substructures, + gedcom_payloads, + gedcom_enumerations, + internal = TRUE, + overwrite = TRUE +) diff --git a/man/detectGedcomVersion.Rd b/man/detectGedcomVersion.Rd new file mode 100644 index 00000000..d218f287 --- /dev/null +++ b/man/detectGedcomVersion.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpReadGedcom.R +\name{detectGedcomVersion} +\alias{detectGedcomVersion} +\title{Detect GEDCOM Version from File Lines} +\usage{ +detectGedcomVersion(lines) +} +\arguments{ +\item{lines}{Character vector of lines from a GEDCOM file.} +} +\value{ +A string such as `"5.5.1"`, `"7.0"`, or `"unknown"`. +} +\description{ +Detect GEDCOM Version from File Lines +} +\keyword{internal} diff --git a/man/gedcomLatToNumeric.Rd b/man/gedcomLatToNumeric.Rd new file mode 100644 index 00000000..b81ebced --- /dev/null +++ b/man/gedcomLatToNumeric.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpReadGedcom.R +\name{gedcomLatToNumeric} +\alias{gedcomLatToNumeric} +\title{Convert GEDCOM Latitude String to Numeric} +\usage{ +gedcomLatToNumeric(x) +} +\arguments{ +\item{x}{Character vector of GEDCOM latitude values.} +} +\value{ +Numeric vector of decimal degrees (positive = N, negative = S). +} +\description{ +Converts GEDCOM-style latitude strings like `"N51.5074"` or `"S33.8688"` to +signed decimal degrees. Returns `NA` for `NA` or unrecognised-prefix input. +} +\examples{ +BGmisc:::gedcomLatToNumeric(c("N51.5074", "S33.8688", NA)) +} +\keyword{internal} diff --git a/man/gedcomLonToNumeric.Rd b/man/gedcomLonToNumeric.Rd new file mode 100644 index 00000000..7914d7c0 --- /dev/null +++ b/man/gedcomLonToNumeric.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpReadGedcom.R +\name{gedcomLonToNumeric} +\alias{gedcomLonToNumeric} +\title{Convert GEDCOM Longitude String to Numeric} +\usage{ +gedcomLonToNumeric(x) +} +\arguments{ +\item{x}{Character vector of GEDCOM longitude values.} +} +\value{ +Numeric vector of decimal degrees (positive = E, negative = W). +} +\description{ +Converts GEDCOM-style longitude strings like `"E151.2093"` or `"W0.1278"` to +signed decimal degrees. Returns `NA` for `NA` or unrecognised-prefix input. +} +\examples{ +BGmisc:::gedcomLonToNumeric(c("E151.2093", "W0.1278", NA)) +} +\keyword{internal} diff --git a/man/postProcessGedcom.Rd b/man/postProcessGedcom.Rd index f1449735..c91179e8 100644 --- a/man/postProcessGedcom.Rd +++ b/man/postProcessGedcom.Rd @@ -8,8 +8,9 @@ postProcessGedcom( df_temp, remove_empty_cols = TRUE, combine_cols = TRUE, - parse_dates = FALSE, add_parents = TRUE, + parse_dates = FALSE, + clean_names = TRUE, skinny = TRUE, verbose = FALSE ) @@ -19,13 +20,19 @@ postProcessGedcom( \item{remove_empty_cols}{Logical indicating whether to remove columns that are entirely missing.} -\item{combine_cols}{Logical indicating whether to combine columns with duplicate values.} +\item{combine_cols}{Logical. If `TRUE`, combine redundant name columns, such +as `name_given` with `name_given_pieces` and `name_surn` with +`name_surn_pieces`, when their values do not conflict.} + +\item{add_parents}{Logical. If `TRUE`, infer `momID` and `dadID` from `FAMC` +and `FAMS` mappings during post-processing.} -\item{parse_dates}{Logical indicating whether to parse date columns into Date objects.} +\item{parse_dates}{Logical. If `TRUE`, attempt to parse date columns (e.g., `birth_date`, `death_date`) into Date objects, after removing common GEDCOM date qualifiers like "ABT", "BEF", and "AFT".} -\item{add_parents}{Logical indicating whether to add parent information.} +\item{clean_names}{Logical indicating whether to clean name columns by removing trailing slashes and squishing whitespace.} -\item{skinny}{Logical indicating whether to slim down the data frame.} +\item{skinny}{Logical. If `TRUE`, return a slimmer data frame by dropping +`FAMC`, `FAMS`, and columns that are entirely `NA` during post-processing.} \item{verbose}{Logical indicating whether to print progress messages.} } diff --git a/man/processEventLine.Rd b/man/processEventLine.Rd index d4cff3d3..40bb9417 100644 --- a/man/processEventLine.Rd +++ b/man/processEventLine.Rd @@ -18,10 +18,9 @@ processEventLine(event, block, i, record, pattern_rows) \item{pattern_rows}{A list with counts of GEDCOM tag occurrences.} } \value{ -The updated record with parsed event information.# +The updated record with parsed event information. } \description{ Extracts event details (e.g., date, place, cause, latitude, longitude) from a block of GEDCOM lines. -For "birth": expect DATE on line i+1, PLAC on i+2, LATI on i+4, LONG on i+5. -For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. +Uses level-aware sub-block parsing so fields are looked up by tag name rather than fixed offsets. } diff --git a/man/readGedcom.Rd b/man/readGedcom.Rd index 2350716b..63cd9079 100644 --- a/man/readGedcom.Rd +++ b/man/readGedcom.Rd @@ -9,39 +9,42 @@ readGedcom( file_path, verbose = FALSE, + post_process = TRUE, add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, skinny = FALSE, parse_dates = FALSE, + clean_names = TRUE, update_rate = 1000, - post_process = TRUE, ... ) readGed( file_path, verbose = FALSE, + post_process = TRUE, add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, skinny = FALSE, parse_dates = FALSE, + clean_names = TRUE, update_rate = 1000, - post_process = TRUE, ... ) readgedcom( file_path, verbose = FALSE, + post_process = TRUE, add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, skinny = FALSE, parse_dates = FALSE, + clean_names = TRUE, update_rate = 1000, - post_process = TRUE, ... ) } @@ -50,11 +53,13 @@ readgedcom( \item{verbose}{Logical. If `TRUE`, print progress messages.} +\item{post_process}{Logical. If `TRUE`, apply post-processing steps controlled +by `add_parents`, `combine_cols`, `remove_empty_cols`, `skinny`, and `parse_dates`.} + \item{add_parents}{Logical. If `TRUE`, infer `momID` and `dadID` from `FAMC` and `FAMS` mappings during post-processing.} -\item{remove_empty_cols}{Logical. If `TRUE`, drop columns that are entirely -`NA` during post-processing.} +\item{remove_empty_cols}{Logical indicating whether to remove columns that are entirely missing.} \item{combine_cols}{Logical. If `TRUE`, combine redundant name columns, such as `name_given` with `name_given_pieces` and `name_surn` with @@ -65,12 +70,11 @@ as `name_given` with `name_given_pieces` and `name_surn` with \item{parse_dates}{Logical. If `TRUE`, attempt to parse date columns (e.g., `birth_date`, `death_date`) into Date objects, after removing common GEDCOM date qualifiers like "ABT", "BEF", and "AFT".} +\item{clean_names}{Logical indicating whether to clean name columns by removing trailing slashes and squishing whitespace.} + \item{update_rate}{Numeric. Intended rate at which progress messages should be printed. Currently unused.} -\item{post_process}{Logical. If `TRUE`, apply post-processing steps controlled -by `add_parents`, `combine_cols`, `remove_empty_cols`, `skinny`, and `parse_dates`.} - \item{...}{Additional arguments. Currently unused.} } \value{ @@ -139,12 +143,14 @@ cleaned full name. Additional name components are parsed when present, including name prefix, name suffix, nickname, and married surname. Birth and death events are recognized from `BIRT` and `DEAT` tags. Event -details are currently parsed using fixed offsets within the individual block. -For birth events, the parser expects `DATE` at `i + 1`, `PLAC` at `i + 2`, -`LATI` at `i + 4`, and `LONG` at `i + 5`. For death events, the parser -expects `DATE` at `i + 1`, `PLAC` at `i + 2`, `CAUS` at `i + 3`, `LATI` at -`i + 4`, and `LONG` at `i + 5`. Missing elements leave the corresponding -output fields as `NA`. +details are parsed by collecting all child lines whose GEDCOM level equals +the event level plus one (direct children), then looking up sub-fields by +tag name. `DATE`, `PLAC`, and `CAUS` are matched as direct children of the +event. Coordinates (`LATI` and `LONG`) are searched across all descendant +lines, which allows them to be located whether they appear as direct children +(common in some GEDCOM 5.5.x exporters), under `PLAC` (standard GEDCOM +5.5.1), or under a `MAP` substructure under `PLAC` (GEDCOM 7.x). Missing +sub-fields leave the corresponding output columns as `NA`. Attribute tags such as `OCCU`, `EDUC`, `RELI`, `CAST`, `NCHI`, `NMR`, `NATI`, `RESI`, `PROP`, `SSN`, `TITL`, `DSCR`, and `IDNO` are parsed directly into diff --git a/tests/testthat/test-readGedcom.R b/tests/testthat/test-readGedcom.R index ab11e915..a8102611 100644 --- a/tests/testthat/test-readGedcom.R +++ b/tests/testthat/test-readGedcom.R @@ -80,14 +80,14 @@ test_that("readGedcom reads and parses a GEDCOM file correctly", { expect_equal(nrow(df), 2) expect_equal(df$name_given[1], "John") expect_equal(df$name_surn[1], "Doe") - expect_equal(df$name[1], "John Doe/") + expect_equal(df$name[1], "John Doe") expect_equal(df$sex[1], "M") expect_equal(df$birth_date[1], "1 JAN 1900") expect_equal(df$birth_place[1], "Someplace") expect_equal(df$attribute_children[1], NA_character_) expect_equal(df$name_given[2], "Jane") expect_equal(df$name_surn[2], "Smith") - expect_equal(df$name[2], "Jane Smith/") + expect_equal(df$name[2], "Jane Smith") expect_equal(df$sex[2], "F") expect_equal(df$birth_date[2], "2 FEB 1910") expect_equal(df$birth_place[2], "Anotherplace") @@ -213,7 +213,7 @@ test_that("readGedcom parses death event correctly", { temp_file <- tempfile(fileext = ".ged") writeLines(gedcom_content, temp_file) - df <- readGedcom(temp_file, verbose = TRUE) + df <- readGedcom(temp_file, verbose = TRUE, clean_names = FALSE) expect_true("death_date" %in% colnames(df)) expect_true("death_place" %in% colnames(df)) @@ -243,6 +243,8 @@ test_that("readGedcom parses death event correctly", { row.names(df) <- NULL row.names(df_leg) <- NULL df_leg <- dplyr::rename(df_leg, personID = id) + # Strip the gedcom_version attribute added by readGedcom before comparing + attr(df, "gedcom_version") <- NULL expect_equal(df_leg, df) unlink(temp_file) @@ -352,7 +354,7 @@ test_that("readGedcom parses all supported name component tags", { expect_equal(df$name_nick[1], "Jack") expect_equal(df$name_surn_pieces[1], "Doe") expect_equal(df$name_nsfx[1], "Jr.") - expect_equal(df$name_marriedsurn[1], "John Quincy /MarriedDoe/") + expect_equal(df$name_marriedsurn[1], "John Quincy /MarriedDoe") unlink(temp_file) }) @@ -575,6 +577,121 @@ test_that("mapFAMS2parents warns and returns NULL when required columns are miss expect_null(out) }) +birt_no_date_content <- c( + "0 @I1@ INDI", + "1 NAME Alice /Jones/", + "1 SEX F", + "1 BIRT", + "2 PLAC Springfield", + "2 LATI N39.7817", + "2 LONG W89.6501" +) + +birt_no_plac_content <- c( + "0 @I1@ INDI", + "1 NAME Bob /Smith/", + "1 SEX M", + "1 BIRT", + "2 DATE 15 MAR 1955" +) + +birt_reordered_content <- c( + "0 @I1@ INDI", + "1 NAME Carol /Lee/", + "1 SEX F", + "1 BIRT", + "2 LATI N40.7128", + "2 LONG W74.0060", + "2 PLAC New York", + "2 DATE 4 JUL 1976" +) + +test_that("processEventLine handles missing DATE gracefully", { + temp_file <- tempfile(fileext = ".ged") + writeLines(birt_no_date_content, temp_file) + df <- readGedcom(temp_file, remove_empty_cols = FALSE) + expect_true(is.na(df$birth_date[1])) + expect_equal(df$birth_place[1], "Springfield") + expect_equal(df$birth_lat[1], "N39.7817") + expect_equal(df$birth_long[1], "W89.6501") + unlink(temp_file) +}) + +test_that("processEventLine handles missing PLAC gracefully", { + temp_file <- tempfile(fileext = ".ged") + writeLines(birt_no_plac_content, temp_file) + df <- readGedcom(temp_file, remove_empty_cols = FALSE) + expect_equal(df$birth_date[1], "15 MAR 1955") + expect_true(is.na(df$birth_place[1])) + unlink(temp_file) +}) + +test_that("processEventLine handles reordered subfields correctly", { + temp_file <- tempfile(fileext = ".ged") + writeLines(birt_reordered_content, temp_file) + df <- readGedcom(temp_file) + expect_equal(df$birth_date[1], "4 JUL 1976") + expect_equal(df$birth_place[1], "New York") + expect_equal(df$birth_lat[1], "N40.7128") + expect_equal(df$birth_long[1], "W74.0060") + unlink(temp_file) +}) + +gedcom55_header <- c( + "0 HEAD", + "1 GEDC", + "2 VERS 5.5.1", + "2 FORM LINEAGE-LINKED", + "1 CHAR UTF-8", + "0 @I1@ INDI", + "1 NAME Test /Person/", + "1 SEX M" +) + +gedcom7_header <- c( + "0 HEAD", + "1 GEDC", + "2 VERS 7.0", + "1 CHAR UTF-8", + "0 @I1@ INDI", + "1 NAME Test /Person/", + "1 SEX M" +) + +test_that("detectGedcomVersion returns correct version string", { + expect_equal(BGmisc:::detectGedcomVersion(gedcom55_header), "5.5.1") + expect_equal(BGmisc:::detectGedcomVersion(gedcom7_header), "7.0") + expect_equal(BGmisc:::detectGedcomVersion(c("0 @I1@ INDI", "1 NAME No /Head/")), "unknown") +}) + +test_that("readGedcom attaches gedcom_version attribute", { + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom55_header, temp_file) + df <- readGedcom(temp_file) + expect_equal(attr(df, "gedcom_version"), "5.5.1") + unlink(temp_file) +}) + +test_that("detectGedcomVersion returns unknown when GEDC is present but VERS is missing", { + lines <- c( + "0 HEAD", + "1 GEDC", + "1 CHAR UTF-8", + "0 @I1@ INDI", + "1 NAME Test /Person/", + "1 SEX M" + ) + expect_equal(BGmisc:::detectGedcomVersion(lines), "unknown") +}) + +test_that("readGedcom attaches gedcom_version attribute with post_process = FALSE", { + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom55_header, temp_file) + df <- readGedcom(temp_file, post_process = FALSE) + expect_equal(attr(df, "gedcom_version"), "5.5.1") + unlink(temp_file) +}) + test_that("readGed and readgedcom aliases return the same output as readGedcom", { gedcom_content <- c( "0 @I1@ INDI", @@ -598,3 +715,78 @@ test_that("readGed and readgedcom aliases return the same output as readGedcom", unlink(temp_file) }) + +gedcom7_map_content <- c( + "0 HEAD", + "1 GEDC", + "2 VERS 7.0", + "0 @I1@ INDI", + "1 NAME Test /Person/", + "1 SEX F", + "1 BIRT", + "2 DATE 1 JAN 2000", + "2 PLAC London, England", + "3 MAP", + "4 LATI N51.5074", + "4 LONG W0.1278", + "1 DEAT", + "2 DATE 31 DEC 2080", + "2 PLAC Edinburgh, Scotland", + "3 MAP", + "4 LATI N55.9533", + "4 LONG W3.1883" +) + +gedcom_calendar_escape_content <- c( + "0 HEAD", + "1 GEDC", + "2 VERS 5.5", + "0 @I1@ INDI", + "1 NAME Old /Style/", + "1 SEX M", + "1 BIRT", + "2 DATE @#DGREGORIAN@ 15 JUL 1823" +) + +test_that("readGedcom parses GEDCOM 7.x MAP coordinate structure", { + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom7_map_content, temp_file) + df <- readGedcom(temp_file) + expect_equal(attr(df, "gedcom_version"), "7.0") + expect_equal(df$birth_lat[1], "N51.5074") + expect_equal(df$birth_long[1], "W0.1278") + expect_equal(df$death_lat[1], "N55.9533") + expect_equal(df$death_long[1], "W3.1883") + unlink(temp_file) +}) + +test_that("parse_dates strips GEDCOM 5.5 calendar escape before parsing", { + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom_calendar_escape_content, temp_file) + df <- readGedcom(temp_file, parse_dates = TRUE) + expect_false(is.na(df$birth_date[1])) + expect_equal(format(df$birth_date[1], "%d %b %Y"), "15 Jul 1823") + unlink(temp_file) +}) + +test_that("gedcomLatToNumeric converts N/S notation correctly", { + expect_equal(gedcomLatToNumeric("N51.5074"), 51.5074) + expect_equal(gedcomLatToNumeric("S33.8688"), -33.8688) + expect_true(is.na(gedcomLatToNumeric(NA_character_))) +}) + +test_that("gedcomLonToNumeric converts E/W notation correctly", { + expect_equal(gedcomLonToNumeric("E151.2093"), 151.2093) + expect_equal(gedcomLonToNumeric("W0.1278"), -0.1278) + expect_true(is.na(gedcomLonToNumeric(NA_character_))) +}) + +test_that("gedcomLatToNumeric returns NA for unrecognised prefix", { + expect_true(is.na(gedcomLatToNumeric("12.34"))) + expect_true(is.na(gedcomLatToNumeric(""))) +}) + +test_that("gedcomLonToNumeric returns NA for unrecognised prefix", { + expect_true(is.na(gedcomLonToNumeric("12.34"))) + expect_true(is.na(gedcomLonToNumeric(""))) +}) diff --git a/tests/testthat/test-readGedcomlegacy.R b/tests/testthat/test-readGedcomlegacy.R index ea7a505c..24ee0fe1 100644 --- a/tests/testthat/test-readGedcomlegacy.R +++ b/tests/testthat/test-readGedcomlegacy.R @@ -14,7 +14,9 @@ test_that("readGedcom parses death event correctly for legacy", { temp_file <- tempfile(fileext = ".ged") writeLines(gedcom_content, temp_file) - df <- readGedcom(temp_file, verbose = TRUE) + df <- readGedcom(temp_file, verbose = TRUE, + clean_names = FALSE + ) df_leg <- .readGedcom.legacy(temp_file, verbose = TRUE) expect_true("death_date" %in% colnames(df_leg)) @@ -32,6 +34,8 @@ test_that("readGedcom parses death event correctly for legacy", { row.names(df) <- NULL row.names(df_leg) <- NULL df_leg <- dplyr::rename(df_leg, personID = id) + # Strip the gedcom_version attribute added by readGedcom before comparing + attr(df, "gedcom_version") <- NULL expect_equal(df_leg, df) unlink(temp_file)