Merge pull request #794 from airweave-ai/feat/cred_sec

orhanrauf · web-flow · commit 388f8dc9ce5e · 2025-09-17T15:31:38.000+02:00
vuln: don't log credentials
diff --git a/backend/airweave/core/credential_sanitizer.py b/backend/airweave/core/credential_sanitizer.py
@@ -0,0 +1,241 @@
+"""Credential sanitization utilities for safe logging.
+
+This module provides functions to safely log credential information without
+exposing sensitive data like API keys, tokens, passwords, etc.
+"""
+
+import re
+from typing import Any, Callable, Dict
+
+
+def sanitize_credential_value(value: Any, show_length: bool = True) -> str:
+    """Sanitize a credential value for safe logging.
+
+    Strings longer than 8 chars keep a first-3/last-2 preview; shorter ones
+    report only their length. Booleans are shown as-is.
+
+    Args:
+        value: The credential value to sanitize
+        show_length: Whether to show the length of long string values
+
+    Returns:
+        A sanitized string representation of the value
+    """
+    if isinstance(value, str):
+        if len(value) <= 8:
+            # Too short for a preview: it would reveal most of the value.
+            return f"<redacted:{len(value)} chars>"
+        preview = f"{value[:3]}...{value[-2:]}"
+        if show_length:
+            return f"<redacted:{len(value)} chars:{preview}>"
+        return f"<redacted:{preview}>"
+    # bool is a subclass of int, so it must be tested before the numeric case.
+    if isinstance(value, bool):
+        return f"<redacted bool:{value}>"
+    if value is None:
+        return "<redacted:null>"
+    return f"<redacted {type(value).__name__}>"
+
+
+def sanitize_credentials_dict(
+    credentials: Dict[str, Any], show_lengths: bool = True
+) -> Dict[str, str]:
+    """Build a log-safe copy of a credentials mapping.
+
+    Args:
+        credentials: Mapping of credential field names to raw values
+        show_lengths: Forwarded to sanitize_credential_value for each value
+
+    Returns:
+        New dictionary with the same keys and sanitized string values
+    """
+    return {
+        name: sanitize_credential_value(raw, show_lengths)
+        for name, raw in credentials.items()
+    }
+
+
+def get_safe_credential_summary(credentials: Dict[str, Any]) -> str:
+    """Summarize a credentials mapping by field name only.
+
+    Values are never read; only the keys are classified (via
+    _is_sensitive_field) and reported.
+
+    Args:
+        credentials: Dictionary containing credential data
+
+    Returns:
+        "No credentials found" for an empty or None mapping, otherwise a
+        " | "-joined summary of field counts and field names
+    """
+    if not credentials:
+        return "No credentials found"
+
+    # Classify every key once, keeping the mapping's iteration order.
+    flags = {name: _is_sensitive_field(name) for name in credentials}
+    secret_names = [name for name, hit in flags.items() if hit]
+    plain_names = [name for name, hit in flags.items() if not hit]
+
+    pieces = [
+        f"Total fields: {len(credentials)}",
+        f"Sensitive fields: {len(secret_names)}",
+        f"Non-sensitive fields: {len(plain_names)}",
+    ]
+
+    # Name lists are appended only when non-empty, plain names first.
+    if plain_names:
+        pieces.append(f"Non-sensitive: {plain_names}")
+
+    if secret_names:
+        pieces.append(f"Sensitive: {secret_names}")
+
+    return " | ".join(pieces)
+
+
+def _is_sensitive_field(field_name: str) -> bool:
+    """Tell whether a field name looks like it holds a secret.
+
+    Args:
+        field_name: The name of the field to check
+
+    Returns:
+        True if the lower-cased name contains any of the markers below
+    """
+    lowered = field_name.lower()
+    markers = (
+        r"token",
+        r"key",
+        r"secret",
+        r"password",
+        r"credential",
+        r"auth",
+        r"access",
+        r"refresh",
+        r"bearer",
+        r"api_key",
+        r"client_secret",
+        r"private",
+        r"session",
+        r"cookie",
+    )
+    hits = (re.search(marker, lowered) is not None for marker in markers)
+    return any(hits)
+
+
+def safe_log_credentials(
+    credentials: Dict[str, Any],
+    logger_func: Callable[[str], None],
+    message_prefix: str = "",
+) -> None:
+    """Emit a value-free summary of credentials through a logger callable.
+
+    Args:
+        credentials: Dictionary containing credential data
+        logger_func: Callable receiving the message (e.g., logger.info)
+        message_prefix: Text placed before the summary, space-separated
+    """
+    text = get_safe_credential_summary(credentials)
+    # An empty prefix is skipped so the message has no leading space.
+    if message_prefix:
+        text = f"{message_prefix} {text}"
+    logger_func(text)
+
+
+def safe_log_credential_fields(
+    credentials: Dict[str, Any],
+    logger_func: Callable[[str], None],
+    message_prefix: str = "",
+) -> None:
+    """Safely log credential field names and types without values.
+
+    String fields also report their length. With no prefix, messages start
+    with the bare text instead of a stray leading space.
+
+    Args:
+        credentials: Dictionary containing credential data
+        logger_func: Logger function to use
+        message_prefix: Optional prefix for the log message
+    """
+    if not credentials:
+        empty_msg = "No credential fields"
+        logger_func(f"{message_prefix} {empty_msg}" if message_prefix else empty_msg)
+        return
+
+    field_info = [
+        f"{key}({type(value).__name__}:{len(value)} chars)"
+        if isinstance(value, str)
+        else f"{key}({type(value).__name__})"
+        for key, value in credentials.items()
+    ]
+    fields_str = ", ".join(field_info)
+    label = f"{message_prefix} Fields" if message_prefix else "Credential fields"
+    logger_func(f"{label}: {fields_str}")
+
+
+def safe_log_token_info(
+    token: str, logger_func: Callable[[str], None], message_prefix: str = ""
+) -> None:
+    """Safely log token information without exposing the actual token.
+
+    Only the length is logged, plus a first-3/last-3 preview when the token
+    is longer than 10 characters.
+
+    Args:
+        token: The token to log information about
+        logger_func: Logger function to use
+        message_prefix: Optional prefix; when empty no leading space is logged
+    """
+    if not token:
+        missing = "No token provided"
+        logger_func(f"{message_prefix} {missing}" if message_prefix else missing)
+        return
+
+    token_info = f"Length: {len(token)}"
+    if len(token) > 10:
+        token_info += f", Preview: {token[:3]}...{token[-3:]}"
+    logger_func(f"{message_prefix or 'Token info:'} {token_info}")
+
+
+def safe_log_auth_values(
+    auth_values: Dict[str, Any],
+    logger_func: Callable[[str], None],
+    message_prefix: str = "",
+) -> None:
+    """Safely log auth values without exposing sensitive data.
+
+    Only field names and counts are logged; values are never read. With no
+    prefix, messages start with the bare text rather than a leading space.
+
+    Args:
+        auth_values: Dictionary containing auth values
+        logger_func: Logger function to use
+        message_prefix: Optional prefix for the log message
+    """
+    if not auth_values:
+        empty_msg = "No auth values"
+        logger_func(f"{message_prefix} {empty_msg}" if message_prefix else empty_msg)
+        return
+
+    # Separate sensitive and non-sensitive fields, preserving key order
+    sensitive_fields = []
+    non_sensitive_fields = []
+    for key in auth_values:
+        if _is_sensitive_field(key):
+            sensitive_fields.append(key)
+        else:
+            non_sensitive_fields.append(key)
+
+    info_parts = [
+        f"Total: {len(auth_values)}",
+        f"Sensitive: {len(sensitive_fields)}",
+        f"Non-sensitive: {len(non_sensitive_fields)}",
+    ]
+    if non_sensitive_fields:
+        info_parts.append(f"Non-sensitive fields: {non_sensitive_fields}")
+    if sensitive_fields:
+        info_parts.append(f"Sensitive fields: {sensitive_fields}")
+
+    info_str = " | ".join(info_parts)
+    # Fall back to a generic label when the caller supplied no prefix
+    label = message_prefix or "Auth values:"
+    logger_func(f"{label} {info_str}")
diff --git a/backend/airweave/platform/auth_providers/composio.py b/backend/airweave/platform/auth_providers/composio.py
@@ -5,6 +5,10 @@
 import httpx
 from fastapi import HTTPException
 
+from airweave.core.credential_sanitizer import (
+    safe_log_credentials,
+    sanitize_credentials_dict,
+)
 from airweave.platform.auth.schemas import AuthType
 from airweave.platform.auth_providers._base import BaseAuthProvider
 from airweave.platform.decorators import auth_provider
@@ -80,7 +84,10 @@ def _map_field_name(self, airweave_field: str) -> str:
         return self.FIELD_NAME_MAPPING.get(airweave_field, airweave_field)
 
     async def _get_with_auth(
-        self, client: httpx.AsyncClient, url: str, params: Optional[Dict[str, Any]] = None
+        self,
+        client: httpx.AsyncClient,
+        url: str,
+        params: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, Any]:
         """Make authenticated API request using Composio API key.
 
@@ -159,7 +166,11 @@ async def get_creds_for_source(
 
             # TODO: pagination
 
-            self.logger.info(f"\n🔑 [Composio] Found credentials: {found_credentials}\n")
+            safe_log_credentials(
+                found_credentials,
+                self.logger.info,
+                f"\n🔑 [Composio] Retrieved credentials for '{source_short_name}':",
+            )
             return found_credentials
 
     async def _get_source_connected_accounts(
@@ -267,12 +278,13 @@ def _find_matching_connection(
                     self.logger.info(
                         f"\n🔓 [Composio] Available credential fields: {available_fields}\n"
                     )
-                    for field, value in source_creds_dict.items():
-                        if isinstance(value, str) and len(value) > 10:
-                            preview = f"{value[:5]}...{value[-3:]}"
-                        else:
-                            preview = "<non-string or short value>"
-                        self.logger.debug(f"\n  - {field}: {preview}\n")
+                    # Log credential fields safely without exposing values
+                    sanitized_preview = sanitize_credentials_dict(
+                        source_creds_dict, show_lengths=False
+                    )
+                    self.logger.debug(
+                        f"\n🔓 [Composio] Credential fields preview: {sanitized_preview}\n"
+                    )
                 break
 
         if not source_creds_dict:
diff --git a/backend/airweave/platform/auth_providers/pipedream.py b/backend/airweave/platform/auth_providers/pipedream.py
@@ -6,6 +6,7 @@
 import httpx
 from fastapi import HTTPException
 
+from airweave.core.credential_sanitizer import safe_log_credentials
 from airweave.platform.auth.schemas import AuthType
 from airweave.platform.auth_providers._base import BaseAuthProvider
 from airweave.platform.decorators import auth_provider
@@ -263,7 +264,11 @@ async def get_creds_for_source(
                 account_data, source_auth_config_fields, source_short_name
             )
 
-            self.logger.info(f"\n🔑 [Pipedream] Found credentials: {found_credentials}\n")
+            safe_log_credentials(
+                found_credentials,
+                self.logger.info,
+                f"\n🔑 [Pipedream] Retrieved credentials for '{source_short_name}':",
+            )
             return found_credentials
 
     async def _get_account_with_credentials(
diff --git a/frontend/src/lib/auth-context.tsx b/frontend/src/lib/auth-context.tsx
@@ -59,10 +59,8 @@ export const AuthProvider: React.FC<{ children: React.ReactNode }> = ({ children
           setTokenInitialized(true);
           console.log('Auth initialization complete');
 
-          // Log the token for debugging
-          console.log('Token length:', accessToken.length);
-          // Print first and last 10 characters of token
-          console.log('Token preview:', accessToken.substring(0, 10) + '...' + accessToken.substring(accessToken.length - 10));
+          // Log token acquisition for debugging (without exposing token content)
+          console.log('🔑 Access token acquired successfully, length:', accessToken.length);
         } catch (error) {
           console.error('Error getting access token', error);
           setToken(null);
diff --git a/frontend/src/pages/SemanticMcp.tsx b/frontend/src/pages/SemanticMcp.tsx
@@ -345,10 +345,18 @@ const SemanticMcp = () => {
         }
     }, [searchParams, setSearchParams]);
 
-    // Log authValues whenever they change
+    // Log authValues metadata whenever they change (without exposing sensitive data)
     useEffect(() => {
-        console.log('🔐 [SemanticMcp] AuthValues updated:', authValues);
-        console.log('🔐 [SemanticMcp] AuthValues keys:', Object.keys(authValues));
+        const sensitiveFields = Object.keys(authValues).filter(key =>
+            /(?:token|key|secret|password|credential)/i.test(key)
+        );
+        const nonSensitiveFields = Object.keys(authValues).filter(key =>
+            !/(?:token|key|secret|password|credential)/i.test(key)
+        );
+
+        console.log('🔐 [SemanticMcp] AuthValues updated - Total fields:', Object.keys(authValues).length,
+            'Sensitive fields:', sensitiveFields.length,
+            'Non-sensitive:', nonSensitiveFields);
     }, [authValues]);
 
     // Log configValues whenever they change
@@ -458,7 +466,7 @@ const SemanticMcp = () => {
                 auth_fields: authValues
             };
 
-            console.log('🔐 [SemanticMcp] Creating credentials for non-OAuth2 source:', credentialData);
+            console.log('🔐 [SemanticMcp] Creating credentials for non-OAuth2 source:', sourceShortName, 'with', Object.keys(credentialData).length, 'fields');
 
             // Make API call to create credentials
             const response = await apiClient.post(
diff --git a/monke/auth/broker.py b/monke/auth/broker.py