Introduction to xmpdf

Table of Contents

Overview

{xmpdf} provides functions for getting and setting Extensible Metadata Platform (XMP) metadata in a variety of media file formats, as well as getting and setting PDF documentation info entries and bookmarks (also known as the outline or table of contents).

Installation

Depending on what you’d like to do you’ll need to install some additional R packages and/or command-line tools:

Examples

Add XMP/docinfo metadata and bookmarks to a pdf

A simple example where we create a two page pdf using pdf() and then add XMP metadata, PDF documentation info metadata, and PDF bookmarks to it:

library("xmpdf")

# Draw two pages of text into a temporary pdf via the `pdf()` device
f <- tempfile(fileext = ".pdf")
pdf(f, onefile = TRUE)
grid::grid.text("Page 1")
grid::grid.newpage()
grid::grid.text("Page 2")
dev.off() |> invisible()

# Inspect the metadata that `pdf()` wrote by default
print(get_docinfo(f)[[1]])
## Author: NULL
## CreationDate: 2024-03-27T23:15:55
## Creator: R
## Producer: R 4.3.3
## Title: R Graphics Output
## Subject: NULL
## Keywords: NULL
## ModDate: 2024-03-27T23:15:55
print(get_xmp(f)[[1]])
## No XMP metadata found
print(get_bookmarks(f)[[1]])
## [1] title    page     level    count    open     color    fontface
## <0 rows> (or 0-length row.names)
# Update the PDF documentation info entries and write them back to the file
d <- update(
  get_docinfo(f)[[1]],
  author = "John Doe",
  subject = "A minimal document to demonstrate {xmpdf} features on",
  title = "Two Boring Pages",
  keywords = c("R", "xmpdf")
)
set_docinfo(d, f)
print(get_docinfo(f)[[1]])
## Author: John Doe
## CreationDate: 2024-03-27T23:15:55
## Creator: R
## Producer: R 4.3.3
## Title: Two Boring Pages
## Subject: A minimal document to demonstrate {xmpdf} features on
## Keywords: R, xmpdf
## ModDate: 2024-03-27T23:15:55
# Derive XMP metadata from the documentation info, extend it, and embed it
x <- update(
  as_xmp(d),
  attribution_url = "https://example.com/attribution",
  date_created = Sys.Date(),
  spdx_id = "CC-BY-4.0"
)
set_xmp(x, f)
print(get_xmp(f)[[1]])
##    cc:attributionName := John Doe
##    cc:attributionURL := https://example.com/attribution
##    cc:license := https://creativecommons.org/licenses/by/4.0/
##    dc:creator := John Doe
##    dc:description := A minimal document to demonstrate {xmpdf} features on
##    dc:rights := © 2024 John Doe. Some rights reserved.
##    dc:subject := R, xmpdf
##    dc:title := Two Boring Pages
##    pdf:Keywords := R, xmpdf
##    pdf:Producer := R 4.3.3
##    photoshop:Credit := John Doe
##    photoshop:DateCreated := 2024-03-27
##    x:XMPToolkit := Image::ExifTool 12.40
##    xmp:CreateDate := 2024-03-27T23:15:55
##    xmp:CreatorTool := R
##    xmp:ModifyDate := 2024-03-27T23:15:55
##    xmpRights:Marked := TRUE
##    xmpRights:UsageTerms := This work is licensed to the public under the Creative Commons
##         Attribution 4.0 International license
##         https://creativecommons.org/licenses/by/4.0/
##    xmpRights:WebStatement := https://creativecommons.org/licenses/by/4.0/
# Add one bookmark per page
bm <- data.frame(
  title = c("Page 1", "Page 2"),
  page = c(1, 2)
)
set_bookmarks(bm, f)
print(get_bookmarks(f)[[1]])
##    title page level count open color fontface
## 1 Page 1    1     1    NA   NA  <NA>     <NA>
## 2 Page 2    2     1    NA   NA  <NA>     <NA>

Add Google Images and Creative Commons license XMP metadata to a png image

Besides pdf files, with exiftool we can also edit the XMP metadata for a large number of image formats including “gif”, “png”, “jpeg”, “tiff”, and “webp”. In particular, we may be interested in setting the subset of IPTC Photo XMP metadata displayed by Google Images as well as embedding Creative Commons license XMP metadata.

library("xmpdf")
# Render a small png image to a temporary file
f <- tempfile(fileext = ".png")
png(f)
grid::grid.text("This is an image!")
invisible(dev.off())

# A freshly created png carries no XMP metadata
print(get_xmp(f)[[1]])
## No XMP metadata found
# Describe the image: creator, caption, creation date, and license
x <- xmp(
  attribution_url = "https://example.com/attribution",
  creator = "John Doe",
  description = "An image caption",
  date_created = Sys.Date(),
  spdx_id = "CC-BY-4.0"
)
# Preview the tags relevant to Google Images
print(x, mode = "google_images", xmp_only = TRUE)
##    dc:creator := John Doe
## => dc:rights = © 2024 John Doe. Some rights reserved.
## => photoshop:Credit = John Doe
## X  plus:Licensor (not currently supported by {xmpdf})
## => xmpRights:WebStatement = https://creativecommons.org/licenses/by/4.0/
# Preview the tags relevant to Creative Commons licensing
print(x, mode = "creative_commons", xmp_only = TRUE)
## => cc:attributionName = John Doe
##    cc:attributionURL := https://example.com/attribution
## => cc:license = https://creativecommons.org/licenses/by/4.0/
##    cc:morePermissions := NULL
## => dc:rights = © 2024 John Doe. Some rights reserved.
## => xmpRights:Marked = TRUE
## => xmpRights:UsageTerms = This work is licensed to the public under the Creative Commons
##         Attribution 4.0 International license
##         https://creativecommons.org/licenses/by/4.0/
## => xmpRights:WebStatement = https://creativecommons.org/licenses/by/4.0/
# Embed the metadata in the png and read it back
set_xmp(x, f)
print(get_xmp(f)[[1]])
##    cc:attributionName := John Doe
##    cc:attributionURL := https://example.com/attribution
##    cc:license := https://creativecommons.org/licenses/by/4.0/
##    dc:creator := John Doe
##    dc:description := An image caption
##    dc:rights := © 2024 John Doe. Some rights reserved.
##    photoshop:Credit := John Doe
##    photoshop:DateCreated := 2024-03-27
##    x:XMPToolkit := Image::ExifTool 12.40
##    xmpRights:Marked := TRUE
##    xmpRights:UsageTerms := This work is licensed to the public under the Creative Commons
##         Attribution 4.0 International license
##         https://creativecommons.org/licenses/by/4.0/
##    xmpRights:WebStatement := https://creativecommons.org/licenses/by/4.0/

Concatenate pdf files and embed concatenated bookmarks

# Build two two-page pdfs; each gets the same pair of per-page bookmarks
f_a <- tempfile(fileext = ".pdf")
pdf(f_a, title = "Document A", onefile = TRUE)
grid::grid.text("Document A: First Page")
grid::grid.newpage()
grid::grid.text("Document A: Second Page")
invisible(dev.off())

f_b <- tempfile(fileext = ".pdf")
pdf(f_b, title = "Document B", onefile = TRUE)
grid::grid.text("Document B: First Page")
grid::grid.newpage()
grid::grid.text("Document B: Second Page")
invisible(dev.off())

bm <- data.frame(
  title = c("First Page", "Second Page"),
  page = c(1, 2)
)
set_bookmarks(bm, f_a)
set_bookmarks(bm, f_b)

# Join the pages of both pdfs into one new pdf
files <- c(f_a, f_b)
f_cat <- tempfile(fileext = ".pdf")
cat_pages(files, f_cat)

# Merge the per-file bookmarks and embed the result in the joined pdf
bm_cat <- cat_bookmarks(get_bookmarks(files), method = "title")
set_bookmarks(bm_cat, f_cat)

get_bookmarks(f_cat)[[1]] |> print()
##         title page level count open color fontface
## 1  Document A    1     1    NA   NA  <NA>     <NA>
## 2  First Page    1     2    NA   NA  <NA>     <NA>
## 3 Second Page    2     2    NA   NA  <NA>     <NA>
## 4  Document B    3     1    NA   NA  <NA>     <NA>
## 5  First Page    3     2    NA   NA  <NA>     <NA>
## 6 Second Page    4     2    NA   NA  <NA>     <NA>

Limitations by backend

| {xmpdf} feature | exiftool | pdftk | ghostscript |
|---|---|---|---|
| Get XMP metadata | Yes | No | No |
| Set XMP metadata | Yes | No | Poor: when documentation info metadata is set then as a side effect it seems the documentation info metadata will also be set as XMP metadata |
| Get PDF bookmarks | No | Okay: can only get Title, Page number, and Level | No |
| Set PDF bookmarks | No | Okay: can only set Title, Page number, and Level | Good: supports most bookmarks features including color and font face but only action supported is to view a particular page |
| Get PDF documentation info | Good: may “widen” datetimes which are less than “second” precision | Yes | No |
| Set PDF documentation info | Yes | Good: may not handle entries with newlines in them | Yes: as a side effect when documentation info metadata is set then it seems it will also be set as XMP metadata |
| Concatenate PDF files | No | Yes | Yes |

Known limitations: