Skip to content

semantify3 API Documentation

semantify³ - Extract knowledge graph ready triples from human-readable annotations wherever possible.

Syntax matters!

Created: 2025-01-29 Authors: Wolfgang Fahl, Tim Holzheim Repository: https://github.com/BITPlan/semantify3

extractor

Extraction of relevant markup snippets for semantify³.

# 🌐🕸
extractor:
  isA: PythonModule
  author: Wolfgang Fahl
  createdAt: 2025-11-29
  purpose: extraction of relevant markup snippets for semantify³.

Extractor

Extract semantic annotation markup from files.

Source code in sem3/extractor.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class Extractor:
    """Extract semantic annotation markup from files."""

    def __init__(self, marker: str = "🌐🕸", debug: bool = False):
        """Constructor.

        Args:
            marker: Marker string that flags a relevant code block.
            debug: If True, emit debug-level log messages.
        """
        self.marker = marker
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG if debug else logging.INFO)

    def log(self, msg: str):
        """Log msg at debug level when debug mode is active.

        Args:
            msg: The message to log.
        """
        if self.debug:
            self.logger.debug(msg)

    def extract_from_file(self, filepath: str) -> List[Markup]:
        """Extract markup snippets from a single file.

        Args:
            filepath: Path to the file to extract from.

        Returns:
            List[Markup]: List of extracted markup snippets;
            empty if the file cannot be read.
        """
        markups = []
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
            markups = self.extract_from_text(content, source_path=filepath)
        except (OSError, UnicodeDecodeError) as e:
            # best-effort: unreadable/undecodable files are skipped with a warning
            self.logger.warning(f"Error reading {filepath}: {e}")
        return markups

    def extract_from_text(
        self, text: str, source_path: Optional[str] = None
    ) -> List[Markup]:
        """Extract all semantic markup snippets from text.

        Args:
            text: The source text to extract from.
            source_path: Optional file path for location tracking.

        Returns:
            List[Markup]: List of extracted markup snippets.
        """
        markups = []

        # Pattern to match code blocks with yaml or sidif
        pattern = re.compile(
            r"```(yaml|sidif)\s*\n"  # Opening fence with language
            r"(.*?)"  # Content (non-greedy)
            r"\n\s*```",  # Closing fence
            re.DOTALL,
        )

        for match in pattern.finditer(text):
            lang = match.group(1)
            raw_content = match.group(2)

            # Find first non-empty line
            lines = raw_content.split("\n")
            first_content_idx = None

            for idx, line in enumerate(lines):
                if line.strip():
                    first_content_idx = idx
                    break

            # all-blank block: nothing to extract
            if first_content_idx is None:
                continue

            first_line = lines[first_content_idx].strip()

            # Only blocks whose first non-empty line carries the marker count
            if self.marker not in first_line:
                continue

            # Extract content after marker line
            content_lines = lines[first_content_idx + 1 :]
            code = "\n".join(content_lines).strip()

            if not code:
                continue

            # Calculate source line of the opening fence (1-based)
            line_num = text[: match.start()].count("\n") + 1

            source = ""
            if source_path:
                source = f"{source_path}:{line_num}"

            markup = Markup(lang=lang, code=code, source=source)
            markups.append(markup)

        # log() already guards on self.debug; only skip the f-string when empty
        if markups:
            self.log(f"Found {len(markups)} snippets in {source_path}")

        return markups

    def extract_from_glob(self, pattern: str) -> List[Markup]:
        """Extract markup snippets from files matching a glob pattern.

        Args:
            pattern: Glob pattern to match files (supports **).

        Returns:
            List[Markup]: All markup snippets from matching files.
        """
        all_markups = []

        files = glob.glob(pattern, recursive=True)
        self.log(f"Glob pattern '{pattern}' found {len(files)} files")

        for filepath in files:
            markups = self.extract_from_file(filepath)
            all_markups.extend(markups)

        return all_markups

    def extract_from_glob_list(self, patterns: List[str]) -> List[Markup]:
        """Extract markup snippets from files matching multiple glob patterns.

        Args:
            patterns: List of glob patterns to match files.

        Returns:
            List[Markup]: All markup snippets from matching files.
        """
        all_markups = []

        self.log(f"Processing {len(patterns)} glob patterns")

        for pattern in patterns:
            self.log(f"Checking pattern: {pattern}")
            markups = self.extract_from_glob(pattern)
            all_markups.extend(markups)

        return all_markups

__init__(marker='🌐🕸', debug=False)

constructor.

Source code in sem3/extractor.py
35
36
37
38
39
40
def __init__(self, marker: str = "🌐🕸", debug: bool = False):
    """Set up the extractor with its marker and debug flag."""
    self.marker = marker
    self.debug = debug
    logger = logging.getLogger(__name__)
    level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(level)
    self.logger = logger

extract_from_file(filepath)

Extract markup snippets from a single file.

Parameters:

Name Type Description Default
filepath str

Path to the file to extract from.

required

Returns:

Type Description
List[Markup]

List[Markup]: List of extracted markup snippets.

Source code in sem3/extractor.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def extract_from_file(self, filepath: str) -> List[Markup]:
    """Extract markup snippets from a single file.

    Args:
        filepath: Path to the file to extract from.

    Returns:
        List[Markup]: List of extracted markup snippets.
    """
    snippets = []
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            text = handle.read()
        snippets = self.extract_from_text(text, source_path=filepath)
    except (IOError, UnicodeDecodeError) as ex:
        # unreadable files yield an empty result with a warning
        self.logger.warning(f"Error reading {filepath}: {ex}")
        snippets = []
    return snippets

extract_from_glob(pattern)

Extract markup snippets from files matching a glob pattern.

Parameters:

Name Type Description Default
pattern str

Glob pattern to match files (supports **).

required

Returns:

Type Description
List[Markup]

List[Markup]: All markup snippets from matching files.

Source code in sem3/extractor.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def extract_from_glob(self, pattern: str) -> List[Markup]:
    """Extract markup snippets from files matching a glob pattern.

    Args:
        pattern: Glob pattern to match files (supports **).

    Returns:
        List[Markup]: All markup snippets from matching files.
    """
    matched_files = glob.glob(pattern, recursive=True)
    self.log(f"Glob pattern '{pattern}' found {len(matched_files)} files")
    collected = []
    for path in matched_files:
        collected.extend(self.extract_from_file(path))
    return collected

extract_from_glob_list(patterns)

Extract markup snippets from files matching multiple glob patterns.

Parameters:

Name Type Description Default
patterns List[str]

List of glob patterns to match files.

required

Returns:

Type Description
List[Markup]

List[Markup]: All markup snippets from matching files.

Source code in sem3/extractor.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def extract_from_glob_list(self, patterns: List[str]) -> List[Markup]:
    """Extract markup snippets from files matching multiple glob patterns.

    Args:
        patterns: List of glob patterns to match files.

    Returns:
        List[Markup]: All markup snippets from matching files.
    """
    collected = []
    self.log(f"Processing {len(patterns)} glob patterns")
    for glob_pattern in patterns:
        self.log(f"Checking pattern: {glob_pattern}")
        collected.extend(self.extract_from_glob(glob_pattern))
    return collected

extract_from_text(text, source_path=None)

Extract all semantic markup snippets from text.

Parameters:

Name Type Description Default
text str

The source text to extract from.

required
source_path Optional[str]

Optional file path for location tracking.

None

Returns:

Type Description
List[Markup]

List[Markup]: List of extracted markup snippets.

Source code in sem3/extractor.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def extract_from_text(
    self, text: str, source_path: Optional[str] = None
) -> List[Markup]:
    """Extract all semantic markup snippets from text.

    Args:
        text: The source text to extract from.
        source_path: Optional file path for location tracking.

    Returns:
        List[Markup]: List of extracted markup snippets.
    """
    # fenced code blocks tagged as yaml or sidif
    fence_re = re.compile(
        r"```(yaml|sidif)\s*\n"  # Opening fence with language
        r"(.*?)"  # Content (non-greedy)
        r"\n\s*```",  # Closing fence
        re.DOTALL,
    )

    results = []
    for hit in fence_re.finditer(text):
        block_lang = hit.group(1)
        body_lines = hit.group(2).split("\n")

        # index of the first non-blank line, or None if all blank
        marker_idx = next(
            (i for i, ln in enumerate(body_lines) if ln.strip()), None
        )
        if marker_idx is None:
            continue

        # only blocks whose first non-blank line carries the marker count
        if self.marker not in body_lines[marker_idx].strip():
            continue

        # everything after the marker line is the annotation payload
        snippet = "\n".join(body_lines[marker_idx + 1 :]).strip()
        if not snippet:
            continue

        # 1-based line number of the opening fence
        lineno = text.count("\n", 0, hit.start()) + 1
        location = f"{source_path}:{lineno}" if source_path else ""

        results.append(Markup(lang=block_lang, code=snippet, source=location))

    if self.debug and results:
        self.log(f"Found {len(results)} snippets in {source_path}")

    return results

Markup dataclass

A single markup.

Source code in sem3/extractor.py
22
23
24
25
26
27
28
29
@lod_storable
@dataclass
class Markup:
    """A single markup snippet extracted from a fenced code block."""

    # language tag of the fenced block, e.g. "yaml" or "sidif"
    lang: str
    # annotation content found after the marker line, stripped
    code: str
    # origin as "<path>:<line>" of the opening fence, or "" when unknown
    source: str

sem3_cmd

Command-line interface for semantify³.

🌐🕸
sem3_cmd:
  isA: PythonModule
  author: Wolfgang Fahl
  createdAt: 2025-11-29
  purpose: Command-line interface for semantify³.

Semantify3Cmd

Bases: BaseCmd

Command line interface for semantify³.

Source code in sem3/sem3_cmd.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
class Semantify3Cmd(BaseCmd):
    """Command line interface for semantify³."""

    def __init__(self):
        """Initialize the semantify³ command."""
        super().__init__(version=Version, description=Version.description)

    def get_arg_parser(self) -> ArgumentParser:
        """Create and configure the argument parser.

        Returns:
            ArgumentParser: The configured argument parser.
        """
        parser = super().get_arg_parser()
        parser.add_argument('files', type=argparse.FileType('r'), nargs='*')

        parser.add_argument(
            "-i",
            "--input",
            type=str,
            help="Input file glob expression",
        )
        parser.add_argument(
            "-o",
            "--output",
            type=str,
            help="Output file path for triples",
        )
        parser.add_argument(
            "--format",
            type=str,
            choices=[
                "turtle",
                "n3",
                "ntriples",
                "xml",
                "json-ld",
                "sidif",
                "graphml",  # Supported by Gremlin and Neo4j (via APOC)
                "graphson",  # Gremlin specific JSON
                "cypher",  # Neo4j Cypher CREATE statements
            ],
            default="turtle",
            help="Output serialization format (default: turtle)",
        )
        return parser

    def handle_args(self, args: Namespace) -> bool:
        """Handle parsed arguments.

        Args:
            args: Parsed argument namespace.

        Returns:
            bool: True if handled, False otherwise.
        """
        handled = super().handle_args(args)
        if handled:
            return True

        if args.input or args.files:
            extractor = Extractor(debug=self.debug)
            markups = []
            if args.input:
                markups.extend(extractor.extract_from_glob(args.input))
            if args.files:
                # argparse.FileType('r') yields open file handles, not paths;
                # read them directly instead of re-opening by path
                for file_obj in args.files:
                    with file_obj:
                        text = file_obj.read()
                    markups.extend(
                        extractor.extract_from_text(text, source_path=file_obj.name)
                    )
            if args.verbose:
                print(f"Found {len(markups)} markups")
            for i, markup in enumerate(markups):
                print(f"{i+1}: {markup.lang} in {os.path.basename(markup.source)}")
                print(markup.code)
                print("-" * 20)
            # extraction performed: the arguments were handled
            return True

        return False

__init__()

Initialize the semantify³ command.

Source code in sem3/sem3_cmd.py
26
27
28
def __init__(self):
    """Initialize the semantify³ command with the package Version metadata."""
    super().__init__(version=Version, description=Version.description)

get_arg_parser()

Create and configure the argument parser.

Returns:

Name Type Description
ArgumentParser ArgumentParser

The configured argument parser.

Source code in sem3/sem3_cmd.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def get_arg_parser(self) -> ArgumentParser:
    """Create and configure the argument parser.

    Returns:
        ArgumentParser: The configured argument parser.
    """
    parser = super().get_arg_parser()
    # positional file arguments are opened for reading by argparse
    parser.add_argument('files', type=argparse.FileType('r'), nargs='*')
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        help="Input file glob expression",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Output file path for triples",
    )
    # serialization targets; the last three address graph databases
    output_formats = [
        "turtle",
        "n3",
        "ntriples",
        "xml",
        "json-ld",
        "sidif",
        "graphml",  # Supported by Gremlin and Neo4j (via APOC)
        "graphson",  # Gremlin specific JSON
        "cypher",  # Neo4j Cypher CREATE statements
    ]
    parser.add_argument(
        "--format",
        type=str,
        choices=output_formats,
        default="turtle",
        help="Output serialization format (default: turtle)",
    )
    return parser

handle_args(args)

Handle parsed arguments.

Parameters:

Name Type Description Default
args Namespace

Parsed argument namespace.

required

Returns:

Name Type Description
bool bool

True if handled, False otherwise.

Source code in sem3/sem3_cmd.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def handle_args(self, args: Namespace) -> bool:
    """Handle parsed arguments.

    Args:
        args: Parsed argument namespace.

    Returns:
        bool: True if handled, False otherwise.
    """
    handled = super().handle_args(args)
    if handled:
        return True

    if args.input or args.files:
        extractor = Extractor(debug=self.debug)
        markups = []
        if args.input:
            markups.extend(extractor.extract_from_glob(args.input))
        if args.files:
            # argparse.FileType('r') yields open file handles, not paths;
            # read them directly instead of re-opening by path
            for file_obj in args.files:
                with file_obj:
                    text = file_obj.read()
                markups.extend(
                    extractor.extract_from_text(text, source_path=file_obj.name)
                )
        if args.verbose:
            print(f"Found {len(markups)} markups")
        for i, markup in enumerate(markups):
            print(f"{i+1}: {markup.lang} in {os.path.basename(markup.source)}")
            print(markup.code)
            print("-" * 20)
        # extraction performed: the arguments were handled
        return True

    return False

main(argv=None)

Main entry point for semantify3 CLI.

Parameters:

Name Type Description Default
argv

Command line arguments.

None

Returns:

Name Type Description
int int

Exit code.

Source code in sem3/sem3_cmd.py
102
103
104
105
106
107
108
109
110
111
112
def main(argv=None) -> int:
    """Main entry point for semantify3 CLI.

    Args:
        argv: Command line arguments.

    Returns:
        int: Exit code.
    """
    # delegate argument handling and execution to the command object
    return Semantify3Cmd().run(argv)

version

Created on 2025-11-29.

@author: wf

Version

Version handling for semantify³.

Source code in sem3/version.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@lod_storable
class Version:
    """Version handling for semantify³."""

    # project display name
    name = "semantify³"
    # version string sourced from the sem3 package
    version = sem3.__version__
    # release and last-update dates
    date = "2025-11-29"
    updated = "2025-11-29"
    description = "Extract knowledge graph ready triples from human-readable annotations wherever possible — Syntax matters!"

    authors = "Wolfgang Fahl, Tim Holzheim"

    # documentation, discussion and source-code-management URLs
    doc_url = "https://wiki.bitplan.com/index.php/semantify3"
    chat_url = "https://github.com/BITPlan/semantify3/discussions"
    cm_url = "https://github.com/BITPlan/semantify3"

    license = """Copyright 2025 contributors. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied."""

    # multi-line summary assembled from the class-level fields above
    longDescription = f"""{name} version {version}
{description}

  Created by {authors} on {date} last updated {updated}"""