
Commit 46d7a4d

Merge pull request #39 from akvo/consistent-metrics-order
Consistent metrics order
2 parents: c1db729 + f64acd3

6 files changed: 41 additions, 35 deletions

.github/workflows/test.yml

Lines changed: 4 additions & 0 deletions

@@ -37,6 +37,10 @@ jobs:
           # Test backend health
           curl -v http://localhost:80/api/health

+      - name: Run backend tests
+        run: |
+          ./backend/test.sh
+
       - name: Show logs if failure
         if: failure()
         run: docker compose logs

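The new step simply shells out to ./backend/test.sh right after the existing curl health check. As a purely hypothetical illustration of the kind of check such a script could drive, here is a standalone Python probe against the same endpoint the workflow already curls; only the URL comes from the workflow above, everything else is assumed:

# check_health.py - illustrative sketch, not part of this commit.
import sys
import urllib.request

HEALTH_URL = "http://localhost:80/api/health"  # same endpoint the workflow curls

def main() -> int:
    try:
        with urllib.request.urlopen(HEALTH_URL, timeout=10) as resp:
            if resp.status == 200:
                print("backend healthy")
                return 0
            print(f"unexpected status: {resp.status}")
    except OSError as exc:  # connection refused, timeout, DNS failure, ...
        print(f"health check failed: {exc}")
    return 1

if __name__ == "__main__":
    sys.exit(main())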
backend/RAG_evaluation/README.md

Lines changed: 8 additions & 8 deletions

@@ -387,14 +387,14 @@ BROWSER_SLOW_MO=1000
 - Reference answers properly entered and used

 **8 Metrics Verified:**
-1. Faithfulness
-2. Answer Relevancy
-3. Context Precision Without Reference
-4. Context Relevancy
-5. Answer Similarity 📚
-6. Answer Correctness 📚
-7. Context Precision 📚
-8. Context Recall 📚
+1.🧠 Faithfulness
+2.🧠 Context Relevancy
+3.Answer Relevancy
+4.🧠 Context Precision Without Reference
+5.🧠📚 Context Recall
+6.🧠📚 Context Precision
+7.📚 Answer Similarity
+8.📚 Answer Correctness

 *(📚 = Reference-based metrics requiring reference answers)*

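For reference, the two markers in the list above describe two overlapping sets of metrics. The sketch below is purely illustrative and only restates the README legend as Python data, using the snake_case keys that appear in the code changes further down:

# Illustrative restatement of the legend above; not part of this commit.
REFERENCE_BASED = {            # 📚 metrics: require a reference answer
    'context_recall', 'context_precision',
    'answer_similarity', 'answer_correctness',
}
CONTEXT_DEPENDENT = {          # 🧠 metrics: require retrieved context
    'faithfulness', 'context_relevancy',
    'context_precision_without_reference',
    'context_recall', 'context_precision',
}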
backend/RAG_evaluation/streamlit_app/components/metrics_explanation.py

Lines changed: 8 additions & 8 deletions

@@ -109,13 +109,13 @@ def get_metric_description(metric_name: str) -> str:
     """
     descriptions = {
         'faithfulness': "How well grounded the response is in the retrieved context",
-        'answer_relevancy': "How relevant the response is to the original query",
         'context_relevancy': "How relevant the retrieved context is to the query",
+        'answer_relevancy': "How relevant the response is to the original query",
         'context_precision_without_reference': "Precision of context retrieval without reference answers",
-        'answer_similarity': "Semantic similarity between generated and reference answers",
-        'answer_correctness': "Factual accuracy against reference answers",
+        'context_recall': "How well retrieved contexts cover the reference answer",
         'context_precision': "More accurate precision using reference answers",
-        'context_recall': "How well retrieved contexts cover the reference answer"
+        'answer_similarity': "Semantic similarity between generated and reference answers",
+        'answer_correctness': "Factual accuracy against reference answers"
     }

     return descriptions.get(metric_name, "Evaluation metric")
@@ -154,13 +154,13 @@ def get_metric_help_text(metric_name: str) -> str:
     """
     help_texts = {
         'faithfulness': "Measures how well the generated answer is supported by the retrieved context. Higher scores indicate better factual consistency.",
-        'answer_relevancy': "Evaluates how well the answer addresses the original question. Higher scores indicate more relevant responses.",
         'context_relevancy': "Assesses the relevance of retrieved context to the query. Higher scores indicate better context retrieval.",
+        'answer_relevancy': "Evaluates how well the answer addresses the original question. Higher scores indicate more relevant responses.",
         'context_precision_without_reference': "Measures precision of context retrieval without requiring reference answers. Higher scores indicate more precise retrieval.",
-        'answer_similarity': "Compares semantic similarity between generated and reference answers. Higher scores indicate closer alignment.",
-        'answer_correctness': "Evaluates factual accuracy against reference answers. Higher scores indicate better correctness.",
+        'context_recall': "Measures how well retrieved contexts cover information in the reference answer. Higher scores indicate better coverage.",
         'context_precision': "More accurate precision measurement using reference answers for comparison. Higher scores indicate better precision.",
-        'context_recall': "Measures how well retrieved contexts cover information in the reference answer. Higher scores indicate better coverage."
+        'answer_similarity': "Compares semantic similarity between generated and reference answers. Higher scores indicate closer alignment.",
+        'answer_correctness': "Evaluates factual accuracy against reference answers. Higher scores indicate better correctness."
     }

     return help_texts.get(metric_name, "RAGAS evaluation metric")

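Because get_metric_description and get_metric_help_text are plain dict lookups, the displayed order is set entirely by whoever iterates the metric names. A minimal usage sketch, assuming import paths that mirror the file layout shown in this commit (streamlit_app.constants and streamlit_app.components.metrics_explanation):

# Sketch only: the import paths are inferred from the file tree, not confirmed.
from streamlit_app.constants import ALL_METRICS
from streamlit_app.components.metrics_explanation import (
    get_metric_description,
    get_metric_help_text,
)

# Iterating ALL_METRICS keeps the UI text in the same canonical order
# that the constants, README and e2e test now share.
for metric in ALL_METRICS:
    print(f"{metric}: {get_metric_description(metric)}")
    print(f"  help: {get_metric_help_text(metric)}")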
backend/RAG_evaluation/streamlit_app/constants.py

Lines changed: 15 additions & 15 deletions

@@ -19,16 +19,16 @@
 # Metric categories
 BASIC_METRICS: List[str] = [
     'faithfulness',
+    'context_relevancy',
     'answer_relevancy',
-    'context_precision_without_reference',
-    'context_relevancy'
+    'context_precision_without_reference'
 ]

 REFERENCE_METRICS: List[str] = [
+    'context_recall',
+    'context_precision',
     'answer_similarity',
-    'answer_correctness',
-    'context_precision',
-    'context_recall'
+    'answer_correctness'
 ]

 ALL_METRICS: List[str] = BASIC_METRICS + REFERENCE_METRICS
@@ -189,24 +189,24 @@
 SHORT_METRICS_EXPLANATIONS: Dict[str, str] = {
     'reference_free': """
 **Reference-Free Metrics** (work without reference answers):
-- **Faithfulness** 🧠: How well grounded the response is in the retrieved context
+- **🧠 Faithfulness**: How well grounded the response is in the retrieved context
+- **🧠 Context Relevancy**: How relevant the retrieved context is to the query
 - **Answer Relevancy**: How relevant the response is to the original query
-- **Context Relevancy** 🧠: How relevant the retrieved context is to the query
-- **Context Precision Without Reference** 🧠: Precision of context retrieval without reference answers
+- **🧠 Context Precision Without Reference**: Precision of context retrieval without reference answers

 **Reference-Based Metrics** (require reference answers for comparison):
-- **Answer Similarity** 📚: Semantic similarity between generated and reference answers
-- **Answer Correctness** 📚: Factual accuracy against reference answers
-- **Context Precision** 🧠📚: More accurate precision using reference answers
-- **Context Recall** 🧠📚: How well retrieved contexts cover the reference answer
+- **🧠📚 Context Recall**: How well retrieved contexts cover the reference answer
+- **🧠📚 Context Precision**: More accurate precision using reference answers
+- **📚 Answer Similarity**: Semantic similarity between generated and reference answers
+- **📚 Answer Correctness**: Factual accuracy against reference answers

 🧠 = Context-dependent | 📚 = Reference-based | *All metrics range from 0.0 to 1.0, with higher scores indicating better performance.*
 """,
     'basic_only': """
 **Context-dependent metrics** 🧠 require retrieved context/documents:
-- **Faithfulness**: How well grounded the response is in the retrieved context
-- **Context Relevancy**: How relevant the retrieved context is to the query
-- **Context Precision Without Reference**: Precision of context retrieval without reference answers
+- **🧠 Faithfulness**: How well grounded the response is in the retrieved context
+- **🧠 Context Relevancy**: How relevant the retrieved context is to the query
+- **🧠 Context Precision Without Reference**: Precision of context retrieval without reference answers

 **Response-only metrics** evaluate the generated response quality:
 - **Answer Relevancy**: How relevant the response is to the original query

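Since ALL_METRICS is just the concatenation of the two lists, every consumer that iterates it inherits the canonical order. A tiny, hypothetical guard against the order drifting again might look like this (the import path is assumed from the file layout; the expected order itself is taken from the diff above):

# Illustrative sanity check; not part of this commit.
from streamlit_app.constants import ALL_METRICS  # path assumed

CANONICAL_ORDER = [
    'faithfulness',
    'context_relevancy',
    'answer_relevancy',
    'context_precision_without_reference',
    'context_recall',
    'context_precision',
    'answer_similarity',
    'answer_correctness',
]

assert ALL_METRICS == CANONICAL_ORDER, "metric order drifted from the canonical order"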
backend/RAG_evaluation/tests/test_eight_metrics_e2e.py

Lines changed: 3 additions & 3 deletions

@@ -326,13 +326,13 @@ async def _verify_eight_metrics(self, page):
         # Search for actual display names as they appear in the Streamlit UI
         expected_metrics = [
             ('faithfulness', 'Faithfulness'),
-            ('answer_relevancy', 'Answer Relevancy'),
             ('context_relevancy', 'Context Relevancy'),
+            ('answer_relevancy', 'Answer Relevancy'),
             ('context_precision_without_reference', 'Context Precision Without Reference'),
+            ('context_recall', 'Context Recall'),
             ('context_precision', 'Context Precision'),
             ('answer_similarity', 'Answer Similarity'),
-            ('answer_correctness', 'Answer Correctness'),
-            ('context_recall', 'Context Recall')
+            ('answer_correctness', 'Answer Correctness')
         ]

         found_metrics = []

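The loop that fills found_metrics falls outside this hunk. As a self-contained sketch of the kind of matching it presumably performs, the snippet below checks which display names occur in captured page text; only the expected_metrics pairs come from the test, the helper and sample text are assumptions:

# Standalone sketch: which metric display names appear in some page text?
expected_metrics = [
    ('faithfulness', 'Faithfulness'),
    ('context_relevancy', 'Context Relevancy'),
    ('answer_relevancy', 'Answer Relevancy'),
    ('context_precision_without_reference', 'Context Precision Without Reference'),
    ('context_recall', 'Context Recall'),
    ('context_precision', 'Context Precision'),
    ('answer_similarity', 'Answer Similarity'),
    ('answer_correctness', 'Answer Correctness'),
]

def find_metrics(page_text: str) -> list[str]:
    """Return the metric keys whose display names appear in the page text."""
    return [key for key, display in expected_metrics if display in page_text]

# Example with stand-in page text:
sample = "Faithfulness 0.91 | Context Relevancy 0.78 | Answer Relevancy 0.85"
print(find_metrics(sample))  # ['faithfulness', 'context_relevancy', 'answer_relevancy']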
backend/entrypoint.sh

Lines changed: 3 additions & 1 deletion

@@ -4,7 +4,9 @@
 set -e

 echo "Waiting for MySQL..."
-while ! nc -z db 3306; do
+DB_HOST=${MYSQL_SERVER:-db}
+DB_PORT=${MYSQL_PORT:-3306}
+while ! nc -z $DB_HOST $DB_PORT; do
   sleep 1
 done
 echo "MySQL started"

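For readers less familiar with nc -z: it exits successfully as soon as a TCP connection to the given host and port can be opened, so the loop above simply polls until MySQL is reachable, now with the host and port taken from MYSQL_SERVER and MYSQL_PORT instead of being hard-coded. A rough Python equivalent of the same loop, with the same environment-variable defaults, purely for illustration:

# Illustrative Python equivalent of the entrypoint's wait loop.
import os
import socket
import time

db_host = os.environ.get("MYSQL_SERVER", "db")        # mirrors ${MYSQL_SERVER:-db}
db_port = int(os.environ.get("MYSQL_PORT", "3306"))   # mirrors ${MYSQL_PORT:-3306}

print("Waiting for MySQL...")
while True:
    try:
        with socket.create_connection((db_host, db_port), timeout=1):
            break  # the port is accepting connections, like a successful nc -z
    except OSError:
        time.sleep(1)
print("MySQL started")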