Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.orcid.script;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.MetadataField;
import org.dspace.content.MetadataValue;
import org.dspace.content.authority.Choices;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.MetadataFieldService;
import org.dspace.content.service.MetadataValueService;
import org.dspace.core.Context;
import org.dspace.eperson.EPerson;
import org.dspace.eperson.factory.EPersonServiceFactory;
import org.dspace.scripts.DSpaceRunnable;
import org.dspace.utils.DSpace;

/**
* Script that assigns ORCID-based authority values to dc.contributor.author metadata
* by matching author names found in dc.identifier.orcid metadata entries.
* The script always overwrites existing authority values to keep data up-to-date.
*
* @author Matus Kasak (dspace at dataquest.sk)
*/
public class OrcidAuthorityAssign
extends DSpaceRunnable<OrcidAuthorityAssignScriptConfiguration<OrcidAuthorityAssign>> {

private static final Logger LOGGER = LogManager.getLogger();

private static final Pattern ORCID_PATTERN =
Pattern.compile("(\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX])");

private MetadataFieldService metadataFieldService;
private MetadataValueService metadataValueService;

private Context context;

@Override
public void setup() throws ParseException {
this.metadataFieldService = ContentServiceFactory.getInstance().getMetadataFieldService();
this.metadataValueService = ContentServiceFactory.getInstance().getMetadataValueService();
}

@Override
public void internalRun() throws Exception {
context = new Context();
assignCurrentUserInContext();

try {
context.turnOffAuthorisationSystem();
performAuthorityAssignment();
context.complete();
} catch (Exception e) {
handler.handleException(e);
context.abort();
} finally {
context.restoreAuthSystemState();
}
}

/**
* Set the authority of dc.contributor.author metadata
* based on matching author names in dc.identifier.orcid.
*/
private void performAuthorityAssignment() throws SQLException, IOException, AuthorizeException {
// Build the author-name-to-ORCID map from dc.identifier.orcid
MetadataField orcidField = metadataFieldService.findByElement(context, "dc", "identifier", "orcid");
if (orcidField == null) {
handler.logError("Metadata field dc.identifier.orcid not found in the registry. Aborting.");
return;
}

List<MetadataValue> orcidValues = metadataValueService.findByField(context, orcidField);
handler.logInfo("Found " + orcidValues.size() + " dc.identifier.orcid metadata entries.");

// Map: normalized author name -> ORCID ID
Map<String, String> authorNameToOrcid = new HashMap<>();

Comment thread
Kasinhou marked this conversation as resolved.
for (MetadataValue orcidMv : orcidValues) {
String rawValue = orcidMv.getValue();
if (StringUtils.isBlank(rawValue)) {
continue;
}

// Extract the ORCID ID from the value
Matcher matcher = ORCID_PATTERN.matcher(rawValue);
if (!matcher.find()) {
handler.logWarning("Could not extract ORCID ID from value: " + rawValue);
continue;
}
String orcidId = matcher.group(1);

// The author name is everything before the ORCID ID, trimmed
String authorName = rawValue.substring(0, matcher.start()).trim();
if (StringUtils.isBlank(authorName)) {
handler.logWarning("Could not extract author name from value: " + rawValue);
continue;
}

String normalizedName = normalizeAuthorName(authorName);
// If there's a duplicate author name with different ORCID
if (authorNameToOrcid.containsKey(normalizedName)
&& !authorNameToOrcid.get(normalizedName).equals(orcidId)) {
handler.logWarning("Duplicate author name '" + authorName
+ "' with different ORCIDs: " + authorNameToOrcid.get(normalizedName)
+ " vs " + orcidId + ". Using the latest.");
}
authorNameToOrcid.put(normalizedName, orcidId);
}

handler.logInfo("Built lookup map with " + authorNameToOrcid.size() + " unique author-ORCID mappings.");

if (authorNameToOrcid.isEmpty()) {
handler.logInfo("No author-ORCID mappings found. Nothing to do.");
return;
}

// Load all dc.contributor.author values
MetadataField authorField = metadataFieldService.findByElement(context, "dc", "contributor", "author");
if (authorField == null) {
handler.logError("Metadata field dc.contributor.author not found in the registry. Aborting.");
return;
}

List<MetadataValue> authorValues = metadataValueService.findByField(context, authorField);
handler.logInfo("Found " + authorValues.size() + " dc.contributor.author metadata entries to check.");

// Match and update
int updated = 0;
int batchSize = 50;

for (MetadataValue authorMv : authorValues) {
String authorValue = authorMv.getValue();
if (StringUtils.isBlank(authorValue)) {
continue;
}

String normalizedAuthor = normalizeAuthorName(authorValue);
String orcidId = authorNameToOrcid.get(normalizedAuthor);

if (orcidId != null) {
authorMv.setAuthority(orcidId);
authorMv.setConfidence(Choices.CF_ACCEPTED);
metadataValueService.update(context, authorMv, true);
updated++;

// Evict processed entities from the Hibernate session in batches
// to keep memory bounded.
if (updated % batchSize == 0) {
context.uncacheEntity(authorMv);
handler.logInfo("Progress: " + updated + " authors updated so far...");
}
}
Comment thread
Kasinhou marked this conversation as resolved.
}

context.commit();

handler.logInfo("Authority assignment complete. Updated: " + updated
+ ", Total author entries checked: " + authorValues.size());
Comment thread
Kasinhou marked this conversation as resolved.
LOGGER.info("OrcidAuthorityAssign updated {} dc.contributor.author entries.", updated);
}

/**
* Normalize an author name for matching purposes.
*/
private String normalizeAuthorName(String name) {
if (name == null) {
return "";
}
return name.trim().toLowerCase(Locale.ROOT).replace(",", "").replaceAll("\\s+", " ");
}

/**
* Assigns the current user to the context.
*/
private void assignCurrentUserInContext() throws SQLException {
UUID uuid = getEpersonIdentifier();
if (uuid != null) {
EPerson ePerson = EPersonServiceFactory.getInstance().getEPersonService().find(context, uuid);
context.setCurrentUser(ePerson);
}
}

@Override
@SuppressWarnings("unchecked")
public OrcidAuthorityAssignScriptConfiguration<OrcidAuthorityAssign> getScriptConfiguration() {
return new DSpace().getServiceManager().getServiceByName("orcid-authority-assign",
OrcidAuthorityAssignScriptConfiguration.class);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.orcid.script;

import org.apache.commons.cli.Options;
import org.dspace.scripts.configuration.ScriptConfiguration;

/**
 * Configuration bean for the {@link OrcidAuthorityAssign} script.
 *
 * The script it configures assigns ORCID-based authority values to
 * dc.contributor.author metadata by matching author names found in
 * dc.identifier.orcid metadata entries. No command-line options are defined:
 * {@link #getOptions()} creates an empty option set.
 *
 * @param <T> the OrcidAuthorityAssign type
 */
public class OrcidAuthorityAssignScriptConfiguration<T extends OrcidAuthorityAssign>
    extends ScriptConfiguration<T> {

    /** The runnable class this configuration belongs to. */
    private Class<T> runnableClass;

    /**
     * @return the script class set through {@link #setDspaceRunnableClass(Class)}
     */
    @Override
    public Class<T> getDspaceRunnableClass() {
        return this.runnableClass;
    }

    /**
     * @param dspaceRunnableClass the script class to associate with this configuration
     */
    @Override
    public void setDspaceRunnableClass(Class<T> dspaceRunnableClass) {
        this.runnableClass = dspaceRunnableClass;
    }

    /**
     * Returns the option set, creating an empty one on first use.
     *
     * @return the options of the script
     */
    @Override
    public Options getOptions() {
        if (options != null) {
            return options;
        }
        super.options = new Options();
        return options;
    }
}
3 changes: 3 additions & 0 deletions dspace-api/src/test/data/dspaceFolder/config/local.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ management.health.solrOai.enabled = false
researcher-profile.entity-type = Person
orcid.synchronization-enabled = true

# Use ORCID sandbox for tests
orcid.domain-url = https://sandbox.orcid.org

# Configuration settings required for Researcher Profiles
# These settings ensure the "dspace.object.owner" field is indexed by Authority Control
choices.plugin.dspace.object.owner = EPersonAuthority
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@
<property name="dspaceRunnableClass" value="org.dspace.orcid.script.OrcidBulkPush"/>
</bean>

<!-- Script configuration for OrcidAuthorityAssign. The bean id must stay "orcid-authority-assign":
     OrcidAuthorityAssign#getScriptConfiguration() looks the configuration up by that name. -->
<bean id="orcid-authority-assign" class="org.dspace.orcid.script.OrcidAuthorityAssignScriptConfiguration">
<property name="description" value="Assign ORCID-based authority to dc.contributor.author metadata by matching author names in dc.identifier.orcid"/>
<property name="dspaceRunnableClass" value="org.dspace.orcid.script.OrcidAuthorityAssign"/>
</bean>

<bean id="process-cleaner" class="org.dspace.administer.ProcessCleanerCliConfiguration">
<property name="description" value="Cleanup all the old processes in the specified state"/>
<property name="dspaceRunnableClass" value="org.dspace.administer.ProcessCleanerCli"/>
Expand Down
Loading
Loading