"""Utilities used in HED validation/loading using a HED schema."""
from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import SchemaWarnings
from hed.schema import hed_schema_constants as constants
from hed.schema.hed_schema_constants import character_types
from hed.schema.hed_schema import HedSchema
def validate_schema_tag_new(hed_entry):
    """Check a tag entry for capitalization and illegal characters.

    Parameters:
        hed_entry (HedTagEntry): A single tag entry.

    Returns:
        list: A list of all formatting issues found in the term. Each issue is a dictionary.
    """
    hed_term = hed_entry.short_tag_name
    # Any # terms will have already been validated as the previous entry.
    if hed_term == "#":
        return []

    issues_list = []
    # The first character of a tag name must be a digit or an uppercase letter.
    # An empty/None term skips this check and is handled by the term validation below.
    if hed_term and not (hed_term[0].isdigit() or hed_term[0].isupper()):
        issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION,
                                                 hed_term, char_index=0, problem_char=hed_term[0])
    # Character-level validation is shared with non-tag schema entries.
    issues_list += validate_schema_term_new(hed_entry, hed_term)
    return issues_list
def validate_schema_term_new(hed_entry, hed_term=None):
    """Check a term for invalid character issues.

    Parameters:
        hed_entry (HedSchemaEntry): A single schema entry.
        hed_term (str or None): Use instead of hed_entry.name if present.

    Returns:
        list: A list of all formatting issues found in the term. Each issue is a dictionary.
    """
    if not hed_term:
        hed_term = hed_entry.name

    # The "name" character set is always allowed; the entry may extend it with a
    # comma-separated list of extra character set names in its allowedCharacter attribute.
    # todo: potentially optimize this someday, as most values are the same
    extra_set_names = hed_entry.attributes.get("allowedCharacter", "").split(",")
    character_set = get_allowed_characters_by_name(["name"] + extra_set_names)

    issues_list = []
    for char, index in get_problem_indexes(hed_term, character_set):
        issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG,
                                                 hed_term, char_index=index, problem_char=char)
    return issues_list
def validate_schema_description_new(hed_entry):
    """Check the description of the entry for invalid character issues.

    Parameters:
        hed_entry (HedSchemaEntry): A single schema entry.

    Returns:
        list: A list of all invalid characters found in description. Each issue is a dictionary.
              Empty if the entry has no description.
    """
    description = hed_entry.description
    if not description:
        return []

    # Descriptions may use the "text" character set plus commas.
    character_set = get_allowed_characters_by_name(["text", "comma"])
    indexes = get_problem_indexes(description, character_set)

    # Kludge, just get short name here if we have it for error reporting
    name = hed_entry.name
    if hasattr(hed_entry, "short_tag_name"):
        name = hed_entry.short_tag_name

    issues_list = []
    for char, index in indexes:
        issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC,
                                                 description, name, problem_char=char, char_index=index)
    return issues_list
def schema_version_for_library(hed_schema, library_name):
    """Given the library name and hed schema object, return the version.

    Parameters:
        hed_schema (HedSchema): The schema object.
        library_name (str or None): The library name you're interested in.
                                    "" (or None) for the standard schema.

    Returns:
        str or None: The version number of the given library name.
                     When the standard schema is requested but not listed directly, the
                     partnered (with_standard) version is returned if set.
                     Returns None if unknown library_name.
    """
    if library_name is None:
        library_name = ""

    # library and version_number are parallel comma-separated lists.
    names = hed_schema.library.split(",")
    versions = hed_schema.version_number.split(",")
    for name, version in zip(names, versions):
        if name == library_name:
            return version

    # Return the partnered schema version
    if library_name == "" and hed_schema.with_standard:
        return hed_schema.with_standard
    return None
def get_allowed_characters(value_classes):
    """Return the allowed characters in a given container of value classes.

    Parameters:
        value_classes (list of HedSchemaEntry): A list of schema entries that should have
                                                the allowedCharacter attribute.

    Returns:
        set: The set of all characters from the given classes, plus "#" and "/".
    """
    # This could be pre-computed
    character_set_names = []
    for value_class in value_classes:
        # The attribute holds a comma-separated list of character set names.
        allowed_types = value_class.attributes.get(constants.HedKey.AllowedCharacter, "")
        character_set_names.extend(allowed_types.split(","))

    character_set = get_allowed_characters_by_name(character_set_names)
    # for now, just always allow these special cases(it's validated extensively elsewhere)
    character_set.update("#/")
    return character_set
def get_allowed_characters_by_name(character_set_names):
    """Return the allowed characters from a list of character set names.

    Note: "nonascii" is a special case "character" that can be included as well.
    It is stored in the result as the literal entry "nonascii" rather than expanded.

    Parameters:
        character_set_names (list of str): A list of character sets to allow.
                                           See hed_schema_constants.character_types.

    Returns:
        set: The set of all characters from the names. Names that are not keys of
             character_types are added to the set verbatim.
    """
    character_set = set()
    for name in character_set_names:
        if name in character_types and name != "nonascii":
            # Known character set: expand it to its individual characters.
            character_set.update(character_types[name])
        else:
            # "nonascii" marker or an unrecognized name: keep the name itself as an entry.
            character_set.add(name)
    return character_set
def get_problem_indexes(validation_string, character_set, index_adj=0):
    """Find indexes with values not in character set.

    Parameters:
        validation_string (str): The string to check characters in.
        character_set (set): The set of valid characters (or the value "nonascii" as a set entry).
                             An empty set disables checking.
        index_adj (int): The value to adjust the reported indices by, if this isn't the start of a string.

    Returns:
        list of (str, int): The problematic characters paired with their (adjusted) indices.
    """
    if not character_set:
        return []

    # When the "nonascii" marker is present, any character above code point 127 is accepted.
    allow_nonascii = "nonascii" in character_set
    return [(char, index + index_adj)
            for index, char in enumerate(validation_string)
            if char not in character_set and not (allow_nonascii and ord(char) > 127)]