feat(blocks): Add Code extraction Block (#8778)

This adds a code extraction block. It was originally made by https://github.com/SerchioSD; I simply updated it and turned it into a PR. ### Changes 🏗️ Adds a new ``code_extraction_block.py`` block, which contains the code: ![image](https://github.com/user-attachments/assets/f7e61390-94e1-49e3-b8ee-b2dc7ea03bfe) ### Updated video to show it working with the latest mapped aliases https://github.com/user-attachments/assets/a96aa708-f06f-4a00-a581-9f64d72f9ee8 --------- Co-authored-by: SerchioSD <69461657+serchiosd@users.noreply.github.com> Co-authored-by: Nicholas Tindle <nicholas.tindle@agpt.co> Co-authored-by: Nicholas Tindle <nicktindle@outlook.com>
2025-01-09 04:19:02 +08:00 · 2024-12-12 20:34:21 +00:00 · 2024-12-12 20:34:21 +00:00 · f090f4ca4a
commit f090f4ca4a
parent 29c771ba1b
1 changed files with 110 additions and 0 deletions
--- a/autogpt_platform/backend/backend/blocks/code_extraction_block.py
+++ b/autogpt_platform/backend/backend/blocks/code_extraction_block.py
@ -0,0 +1,110 @@
+import re
+
+from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
+from backend.data.model import SchemaField
+
+
+class CodeExtractionBlock(Block):
+    """Extract fenced code blocks from text and group them by language.
+
+    A block is recognised when its opening fence is immediately followed by
+    one of the language tags listed in ``run`` (matched case-insensitively)
+    and then whitespace. Text that is not part of a recognised block is
+    emitted as ``remaining_text``.
+    """
+
+    class Input(BlockSchema):
+        text: str = SchemaField(
+            description="Text containing code blocks to extract (e.g., AI response)",
+            placeholder="Enter text containing code blocks",
+        )
+
+    class Output(BlockSchema):
+        # One field per supported language; ``run`` yields a field only when
+        # non-empty code for that language was found.
+        html: str = SchemaField(description="Extracted HTML code")
+        css: str = SchemaField(description="Extracted CSS code")
+        javascript: str = SchemaField(description="Extracted JavaScript code")
+        python: str = SchemaField(description="Extracted Python code")
+        sql: str = SchemaField(description="Extracted SQL code")
+        java: str = SchemaField(description="Extracted Java code")
+        cpp: str = SchemaField(description="Extracted C++ code")
+        csharp: str = SchemaField(description="Extracted C# code")
+        json_code: str = SchemaField(description="Extracted JSON code")
+        bash: str = SchemaField(description="Extracted Bash code")
+        php: str = SchemaField(description="Extracted PHP code")
+        ruby: str = SchemaField(description="Extracted Ruby code")
+        yaml: str = SchemaField(description="Extracted YAML code")
+        markdown: str = SchemaField(description="Extracted Markdown code")
+        typescript: str = SchemaField(description="Extracted TypeScript code")
+        xml: str = SchemaField(description="Extracted XML code")
+        remaining_text: str = SchemaField(
+            description="Remaining text after code extraction"
+        )
+
+    def __init__(self):
+        super().__init__(
+            id="d3a7d896-3b78-4f44-8b4b-48fbf4f0bcd8",
+            description="Extracts code blocks from text and identifies their programming languages",
+            categories={BlockCategory.TEXT},
+            input_schema=CodeExtractionBlock.Input,
+            output_schema=CodeExtractionBlock.Output,
+            test_input={
+                "text": "Here's a Python example:\n```python\nprint('Hello World')\n```\nAnd some HTML:\n```html\n<h1>Title</h1>\n```"
+            },
+            test_output=[
+                ("html", "<h1>Title</h1>"),
+                ("python", "print('Hello World')"),
+                ("remaining_text", "Here's a Python example:\nAnd some HTML:"),
+            ],
+        )
+
+    def run(self, input_data: Input, **kwargs) -> BlockOutput:
+        """Yield the extracted code per language, then the leftover text.
+
+        Yields:
+            ``(output_name, code)`` for each language that has at least one
+            non-empty fenced block; several blocks of the same language are
+            joined with a blank line, in the order they appear in the text.
+            Finally ``("remaining_text", text)`` when any text is left after
+            the recognised blocks have been removed.
+        """
+        # Output field name -> fence tags that are treated as that language.
+        language_aliases = {
+            "html": ["html", "htm"],
+            "css": ["css"],
+            "javascript": ["javascript", "js"],
+            "python": ["python", "py"],
+            "sql": ["sql"],
+            "java": ["java"],
+            "cpp": ["cpp", "c++"],
+            "csharp": ["csharp", "c#", "cs"],
+            "json_code": ["json"],
+            "bash": ["bash", "shell", "sh"],
+            "php": ["php"],
+            "ruby": ["ruby", "rb"],
+            "yaml": ["yaml", "yml"],
+            "markdown": ["markdown", "md"],
+            "typescript": ["typescript", "ts"],
+            "xml": ["xml"],
+        }
+
+        for canonical_name, aliases in language_aliases.items():
+            # Matching every alias of a language in one pass keeps its blocks
+            # in document order instead of grouping them alias by alias.
+            code = "\n\n".join(self._find_blocks(input_data.text, aliases))
+            if code:  # Only yield if there's actual code content
+                yield canonical_name, code
+
+        # Remove all recognised code blocks to get the remaining text. The
+        # pattern comes from the same helper as extraction, so a tag such as
+        # ``Python`` that was extracted case-insensitively is removed as well.
+        all_aliases = [
+            alias for aliases in language_aliases.values() for alias in aliases
+        ]
+        remaining_text = (
+            self._fence_pattern(all_aliases).sub("", input_data.text).strip()
+        )
+        # Collapse blank lines (including those left behind by removed
+        # blocks) into single newlines.
+        remaining_text = re.sub(r"\n\s*\n", "\n", remaining_text)
+
+        if remaining_text:  # Only yield if there's remaining text
+            yield "remaining_text", remaining_text
+
+    def extract_code(self, text: str, language: str) -> str:
+        """Return all code fenced with the tag ``language`` in ``text``.
+
+        Args:
+            text: Text to search for fenced code blocks.
+            language: Literal fence tag to look for (e.g. ``"py"`` or
+                ``"c++"``); it is regex-escaped and matched
+                case-insensitively.
+
+        Returns:
+            The stripped bodies of all matching non-empty blocks joined by a
+            blank line, or ``""`` when nothing matches.
+        """
+        return "\n\n".join(self._find_blocks(text, [language]))
+
+    @staticmethod
+    def _fence_pattern(aliases: list[str]) -> re.Pattern[str]:
+        """Compile a regex for fenced blocks tagged with any of ``aliases``.
+
+        Group 1 of a match is the block body. The tag must be followed by
+        whitespace, so ``java`` does not match a ``javascript`` fence.
+        """
+        tags = "|".join(re.escape(alias) for alias in aliases)
+        return re.compile(rf"```(?:{tags})\s+(.*?)```", re.DOTALL | re.IGNORECASE)
+
+    def _find_blocks(self, text: str, aliases: list[str]) -> list[str]:
+        """Return stripped, non-empty bodies of blocks tagged with ``aliases``."""
+        bodies = (
+            match.group(1).strip()
+            for match in self._fence_pattern(aliases).finditer(text)
+        )
+        # Empty fences would otherwise add stray blank separators when joined.
+        return [body for body in bodies if body]