taylorwilsdon · JoaquinFernandez · Oct 22, 2025
diff --git a/gmail/gmail_tools.py b/gmail/gmail_tools.py
@@ -11,6 +11,7 @@
 from typing import Optional, List, Dict, Literal
 
 from email.mime.text import MIMEText
+from bs4 import BeautifulSoup
 
 from fastapi import Body
 from pydantic import Field
@@ -32,6 +33,38 @@
 HTML_BODY_TRUNCATE_LIMIT = 20000
 
 
+def _html_to_text(html: str) -> str:
+    """
+    Convert HTML to readable plain text, dropping <script> and <style> content.
+
+    Args:
+        html: HTML content
+
+    Returns:
+        Plain text with one non-empty chunk per line; the raw HTML if conversion fails
+    """
+    try:
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Remove script and style elements so their contents do not leak into the text
+        for script in soup(["script", "style"]):
+            script.decompose()
+
+        # Extract the text
+        text = soup.get_text()
+
+        # Collapse excess whitespace: strip each line, split on double spaces, drop empty chunks
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        return text
+    except Exception as e:
+        logger.warning(f"Failed to convert HTML to text: {e}")
+        return html  # Fall back to the raw HTML
+
+
 def _extract_message_body(payload):
     """
     Helper function to extract plain text body from a Gmail message payload.
@@ -103,20 +136,39 @@ def _format_body_content(text_body: str, html_body: str) -> str:
     """
     Helper function to format message body content with HTML fallback and truncation.
 
+    Detects when text/plain is a useless fallback by checking for HTML comments.
+    Plain text should never contain HTML comments (<!--), so their presence
+    indicates the text is a fallback message, not actual content.
+
     Args:
         text_body: Plain text body content
         html_body: HTML body content
 
     Returns:
         Formatted body content string
     """
-    if text_body.strip():
+    # Detect HTML comments in plain text (indicates fallback)
+    has_html_comment = "<!--" in text_body
+
+    # Use HTML if:
+    # 1. Text is empty
+    # 2. Text contains HTML comments (fallback indicator)
+    # 3. HTML is significantly longer (50x+) than text
+    use_html = (
+        not text_body.strip() or
+        has_html_comment or
+        (html_body.strip() and len(html_body) > len(text_body) * 50)
+    )
+
+    if use_html and html_body.strip():
+        # Convert HTML to readable text
+        text_from_html = _html_to_text(html_body)
+        # Truncate very large content to keep responses manageable
+        if len(text_from_html) > HTML_BODY_TRUNCATE_LIMIT:
+            text_from_html = text_from_html[:HTML_BODY_TRUNCATE_LIMIT] + "\n\n[Content truncated...]"
+        return text_from_html
+    elif text_body.strip():
         return text_body
-    elif html_body.strip():
-        # Truncate very large HTML to keep responses manageable
-        if len(html_body) > HTML_BODY_TRUNCATE_LIMIT:
-            html_body = html_body[:HTML_BODY_TRUNCATE_LIMIT] + "\n\n[HTML content truncated...]"
-        return f"[HTML Content Converted]\n{html_body}"
     else:
         return "[No readable content found]"
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,7 @@ readme = "README.md"
 keywords = [ "mcp", "google", "workspace", "llm", "ai", "claude", "model", "context", "protocol", "server"]
 requires-python = ">=3.10"
 dependencies = [
+ "beautifulsoup4>=4.12.0",
  "fastapi>=0.115.12",
  "fastmcp==2.12.5",
  "google-api-python-client>=2.168.0",