Skip to content

semantify3 API Documentation

semantify³ - Extract knowledge graph ready triples from human-readable annotations wherever possible.

Syntax matters!

Created: 2025-01-29 Authors: Wolfgang Fahl, Tim Holzheim Repository: https://github.com/BITPlan/semantify3

extractor

Extraction of relevant markup snippets for semantify³.

# 🌐🕸
extractor:
  isA: PythonModule
  author: Wolfgang Fahl
  createdAt: 2025-11-29
  purpose: extraction of relevant markup snippets for semantify³.

Extractor

Extract semantic annotation markup from files.

Source code in sem3/extractor.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class Extractor:
    """Extract semantic annotation markup from files."""

    def __init__(self, marker: str = "🌐🕸", debug: bool = False):
        """Constructor.

        Args:
            marker: Marker string that flags a relevant code block.
            debug: If True, emit debug-level log messages.
        """
        self.marker = marker
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG if debug else logging.INFO)

    def log(self, msg: str):
        """Log msg at debug level when debug mode is active.

        Args:
            msg: The message to log.
        """
        if self.debug:
            self.logger.debug(msg)

    def extract_from_file(self, filepath: str) -> List[Markup]:
        """Extract markup snippets from a single file.

        Args:
            filepath: Path to the file to extract from.

        Returns:
            List[Markup]: List of extracted markup snippets;
            empty if the file cannot be read.
        """
        markups = []
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
            markups = self.extract_from_text(content, source_path=filepath)
        except (OSError, UnicodeDecodeError) as e:
            # best-effort: unreadable/undecodable files are skipped with a warning
            self.logger.warning(f"Error reading {filepath}: {e}")
        return markups

    def extract_from_text(
        self, text: str, source_path: Optional[str] = None
    ) -> List[Markup]:
        """Extract all semantic markup snippets from text.

        Args:
            text: The source text to extract from.
            source_path: Optional file path for location tracking.

        Returns:
            List[Markup]: List of extracted markup snippets.
        """
        markups = []

        # Pattern to match code blocks with yaml or sidif
        pattern = re.compile(
            r"```(yaml|sidif)\s*\n"  # Opening fence with language
            r"(.*?)"  # Content (non-greedy)
            r"\n\s*```",  # Closing fence
            re.DOTALL,
        )

        for match in pattern.finditer(text):
            lang = match.group(1)
            raw_content = match.group(2)

            # Find first non-empty line
            lines = raw_content.split("\n")
            first_content_idx = None

            for idx, line in enumerate(lines):
                if line.strip():
                    first_content_idx = idx
                    break

            # all-blank block: nothing to extract
            if first_content_idx is None:
                continue

            first_line = lines[first_content_idx].strip()

            # Only blocks whose first non-empty line carries the marker count
            if self.marker not in first_line:
                continue

            # Extract content after marker line
            content_lines = lines[first_content_idx + 1 :]
            code = "\n".join(content_lines).strip()

            if not code:
                continue

            # Calculate source line of the opening fence (1-based)
            line_num = text[: match.start()].count("\n") + 1

            source = ""
            if source_path:
                source = f"{source_path}:{line_num}"

            markup = Markup(lang=lang, code=code, source=source)
            markups.append(markup)

        # log() already guards on self.debug; only skip the f-string when empty
        if markups:
            self.log(f"Found {len(markups)} snippets in {source_path}")

        return markups

    def extract_from_glob(self, pattern: str) -> List[Markup]:
        """Extract markup snippets from files matching a glob pattern.

        Args:
            pattern: Glob pattern to match files (supports **).

        Returns:
            List[Markup]: All markup snippets from matching files.
        """
        all_markups = []

        files = glob.glob(pattern, recursive=True)
        self.log(f"Glob pattern '{pattern}' found {len(files)} files")

        for filepath in files:
            markups = self.extract_from_file(filepath)
            all_markups.extend(markups)

        return all_markups

    def extract_from_glob_list(self, patterns: List[str]) -> List[Markup]:
        """Extract markup snippets from files matching multiple glob patterns.

        Args:
            patterns: List of glob patterns to match files.

        Returns:
            List[Markup]: All markup snippets from matching files.
        """
        all_markups = []

        self.log(f"Processing {len(patterns)} glob patterns")

        for pattern in patterns:
            self.log(f"Checking pattern: {pattern}")
            markups = self.extract_from_glob(pattern)
            all_markups.extend(markups)

        return all_markups

__init__(marker='🌐🕸', debug=False)

constructor.

Source code in sem3/extractor.py
35
36
37
38
39
40
def __init__(self, marker: str = "🌐🕸", debug: bool = False):
    """Set up the extractor with its marker and debug flag."""
    self.marker = marker
    self.debug = debug
    logger = logging.getLogger(__name__)
    level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(level)
    self.logger = logger

extract_from_file(filepath)

Extract markup snippets from a single file.

Parameters:

Name Type Description Default
filepath str

Path to the file to extract from.

required

Returns:

Type Description
List[Markup]

List[Markup]: List of extracted markup snippets.

Source code in sem3/extractor.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def extract_from_file(self, filepath: str) -> List[Markup]:
    """Extract markup snippets from a single file.

    Args:
        filepath: Path to the file to extract from.

    Returns:
        List[Markup]: List of extracted markup snippets.
    """
    snippets = []
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            text = handle.read()
        snippets = self.extract_from_text(text, source_path=filepath)
    except (IOError, UnicodeDecodeError) as ex:
        # unreadable files yield an empty result with a warning
        self.logger.warning(f"Error reading {filepath}: {ex}")
        snippets = []
    return snippets

extract_from_glob(pattern)

Extract markup snippets from files matching a glob pattern.

Parameters:

Name Type Description Default
pattern str

Glob pattern to match files (supports **).

required

Returns:

Type Description
List[Markup]

List[Markup]: All markup snippets from matching files.

Source code in sem3/extractor.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def extract_from_glob(self, pattern: str) -> List[Markup]:
    """Extract markup snippets from files matching a glob pattern.

    Args:
        pattern: Glob pattern to match files (supports **).

    Returns:
        List[Markup]: All markup snippets from matching files.
    """
    matched_files = glob.glob(pattern, recursive=True)
    self.log(f"Glob pattern '{pattern}' found {len(matched_files)} files")
    collected = []
    for path in matched_files:
        collected.extend(self.extract_from_file(path))
    return collected

extract_from_glob_list(patterns)

Extract markup snippets from files matching multiple glob patterns.

Parameters:

Name Type Description Default
patterns List[str]

List of glob patterns to match files.

required

Returns:

Type Description
List[Markup]

List[Markup]: All markup snippets from matching files.

Source code in sem3/extractor.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def extract_from_glob_list(self, patterns: List[str]) -> List[Markup]:
    """Extract markup snippets from files matching multiple glob patterns.

    Args:
        patterns: List of glob patterns to match files.

    Returns:
        List[Markup]: All markup snippets from matching files.
    """
    collected = []
    self.log(f"Processing {len(patterns)} glob patterns")
    for glob_pattern in patterns:
        self.log(f"Checking pattern: {glob_pattern}")
        collected.extend(self.extract_from_glob(glob_pattern))
    return collected

extract_from_text(text, source_path=None)

Extract all semantic markup snippets from text.

Parameters:

Name Type Description Default
text str

The source text to extract from.

required
source_path Optional[str]

Optional file path for location tracking.

None

Returns:

Type Description
List[Markup]

List[Markup]: List of extracted markup snippets.

Source code in sem3/extractor.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def extract_from_text(
    self, text: str, source_path: Optional[str] = None
) -> List[Markup]:
    """Extract all semantic markup snippets from text.

    Args:
        text: The source text to extract from.
        source_path: Optional file path for location tracking.

    Returns:
        List[Markup]: List of extracted markup snippets.
    """
    # fenced code blocks tagged as yaml or sidif
    fence_re = re.compile(
        r"```(yaml|sidif)\s*\n"  # Opening fence with language
        r"(.*?)"  # Content (non-greedy)
        r"\n\s*```",  # Closing fence
        re.DOTALL,
    )

    results = []
    for hit in fence_re.finditer(text):
        block_lang = hit.group(1)
        body_lines = hit.group(2).split("\n")

        # index of the first non-blank line, or None if all blank
        marker_idx = next(
            (i for i, ln in enumerate(body_lines) if ln.strip()), None
        )
        if marker_idx is None:
            continue

        # only blocks whose first non-blank line carries the marker count
        if self.marker not in body_lines[marker_idx].strip():
            continue

        # everything after the marker line is the annotation payload
        snippet = "\n".join(body_lines[marker_idx + 1 :]).strip()
        if not snippet:
            continue

        # 1-based line number of the opening fence
        lineno = text.count("\n", 0, hit.start()) + 1
        location = f"{source_path}:{lineno}" if source_path else ""

        results.append(Markup(lang=block_lang, code=snippet, source=location))

    if self.debug and results:
        self.log(f"Found {len(results)} snippets in {source_path}")

    return results

Markup dataclass

A single markup.

Source code in sem3/extractor.py
22
23
24
25
26
27
28
29
@lod_storable
@dataclass
class Markup:
    """A single markup snippet extracted from a fenced code block."""

    # language tag of the fenced block, e.g. "yaml" or "sidif"
    lang: str
    # annotation content found after the marker line, stripped
    code: str
    # origin as "<path>:<line>" of the opening fence, or "" when unknown
    source: str

sem3_cmd

Command-line interface for semantify³.

🌐🕸
sem3_cmd:
  isA: PythonModule
  author: Wolfgang Fahl
  createdAt: 2025-11-29
  purpose: Command-line interface for semantify³.

Semantify3Cmd

Bases: BaseCmd

Command line interface for semantify³.

Source code in sem3/sem3_cmd.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
class Semantify3Cmd(BaseCmd):
    """Command line interface for semantify³."""

    def __init__(self):
        """Initialize the semantify³ command."""
        super().__init__(version=Version, description=Version.description)

    def get_arg_parser(self) -> ArgumentParser:
        """Create and configure the argument parser.

        Returns:
            ArgumentParser: The configured argument parser.
        """
        parser = super().get_arg_parser()
        parser.add_argument('files', type=argparse.FileType('r'), nargs='*')

        parser.add_argument(
            "-i",
            "--input",
            type=str,
            help="Input file glob expression",
        )
        parser.add_argument(
            "-o",
            "--output",
            type=str,
            help="Output file path for triples",
        )
        parser.add_argument(
            "--format",
            type=str,
            choices=[
                "turtle",
                "n3",
                "ntriples",
                "xml",
                "json-ld",
                "sidif",
                "graphml",  # Supported by Gremlin and Neo4j (via APOC)
                "graphson",  # Gremlin specific JSON
                "cypher",  # Neo4j Cypher CREATE statements
            ],
            default="turtle",
            help="Output serialization format (default: turtle)",
        )
        return parser

    def handle_args(self, args: Namespace) -> bool:
        """Handle parsed arguments.

        Args:
            args: Parsed argument namespace.

        Returns:
            bool: True if handled, False otherwise.
        """
        handled = super().handle_args(args)
        if handled:
            return True

        if args.input or args.files:
            extractor = Extractor(debug=self.debug)
            markups = []
            if args.input:
                markups.extend(extractor.extract_from_glob(args.input))
            if args.files:
                # argparse.FileType('r') yields open file handles, not paths;
                # read them directly instead of re-opening by path
                for file_obj in args.files:
                    with file_obj:
                        text = file_obj.read()
                    markups.extend(
                        extractor.extract_from_text(text, source_path=file_obj.name)
                    )
            if args.verbose:
                print(f"Found {len(markups)} markups")
            for i, markup in enumerate(markups):
                print(f"{i+1}: {markup.lang} in {os.path.basename(markup.source)}")
                print(markup.code)
                print("-" * 20)
            # extraction performed: the arguments were handled
            return True

        return False

__init__()

Initialize the semantify³ command.

Source code in sem3/sem3_cmd.py
26
27
28
def __init__(self):
    """Initialize the semantify³ command with the package Version metadata."""
    super().__init__(version=Version, description=Version.description)

get_arg_parser()

Create and configure the argument parser.

Returns:

Name Type Description
ArgumentParser ArgumentParser

The configured argument parser.

Source code in sem3/sem3_cmd.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def get_arg_parser(self) -> ArgumentParser:
    """Create and configure the argument parser.

    Returns:
        ArgumentParser: The configured argument parser.
    """
    parser = super().get_arg_parser()
    # positional file arguments are opened for reading by argparse
    parser.add_argument('files', type=argparse.FileType('r'), nargs='*')
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        help="Input file glob expression",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Output file path for triples",
    )
    # serialization targets; the last three address graph databases
    output_formats = [
        "turtle",
        "n3",
        "ntriples",
        "xml",
        "json-ld",
        "sidif",
        "graphml",  # Supported by Gremlin and Neo4j (via APOC)
        "graphson",  # Gremlin specific JSON
        "cypher",  # Neo4j Cypher CREATE statements
    ]
    parser.add_argument(
        "--format",
        type=str,
        choices=output_formats,
        default="turtle",
        help="Output serialization format (default: turtle)",
    )
    return parser

handle_args(args)

Handle parsed arguments.

Parameters:

Name Type Description Default
args Namespace

Parsed argument namespace.

required

Returns:

Name Type Description
bool bool

True if handled, False otherwise.

Source code in sem3/sem3_cmd.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def handle_args(self, args: Namespace) -> bool:
    """Handle parsed arguments.

    Args:
        args: Parsed argument namespace.

    Returns:
        bool: True if handled, False otherwise.
    """
    handled = super().handle_args(args)
    if handled:
        return True

    if args.input or args.files:
        extractor = Extractor(debug=self.debug)
        markups = []
        if args.input:
            markups.extend(extractor.extract_from_glob(args.input))
        if args.files:
            # argparse.FileType('r') yields open file handles, not paths;
            # read them directly instead of re-opening by path
            for file_obj in args.files:
                with file_obj:
                    text = file_obj.read()
                markups.extend(
                    extractor.extract_from_text(text, source_path=file_obj.name)
                )
        if args.verbose:
            print(f"Found {len(markups)} markups")
        for i, markup in enumerate(markups):
            print(f"{i+1}: {markup.lang} in {os.path.basename(markup.source)}")
            print(markup.code)
            print("-" * 20)
        # extraction performed: the arguments were handled
        return True

    return False

main(argv=None)

Main entry point for semantify3 CLI.

Parameters:

Name Type Description Default
argv

Command line arguments.

None

Returns:

Name Type Description
int int

Exit code.

Source code in sem3/sem3_cmd.py
102
103
104
105
106
107
108
109
110
111
112
def main(argv=None) -> int:
    """Main entry point for semantify3 CLI.

    Args:
        argv: Command line arguments.

    Returns:
        int: Exit code.
    """
    # delegate argument handling and execution to the command object
    return Semantify3Cmd().run(argv)

version

Created on 2025-11-29.

@author: wf

Version

Version handling for semantify³.

Source code in sem3/version.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@lod_storable
class Version:
    """Version handling for semantify³."""

    # project display name
    name = "semantify³"
    # version string sourced from the sem3 package
    version = sem3.__version__
    # release and last-update dates
    date = "2025-11-29"
    updated = "2025-11-29"
    description = "Extract knowledge graph ready triples from human-readable annotations wherever possible — Syntax matters!"

    authors = "Wolfgang Fahl, Tim Holzheim"

    # documentation, discussion and source-code-management URLs
    doc_url = "https://wiki.bitplan.com/index.php/semantify3"
    chat_url = "https://github.com/BITPlan/semantify3/discussions"
    cm_url = "https://github.com/BITPlan/semantify3"

    license = """Copyright 2025 contributors. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied."""

    # multi-line summary assembled from the class-level fields above
    longDescription = f"""{name} version {version}
{description}

  Created by {authors} on {date} last updated {updated}"""