fix: whisper transcription test use github url + update test (#8455)

davidsbatista · web-flow · commit 5867fa1f34e1 · 2024-10-14T16:24:52.000+02:00
* adding audio file

* changing URL

* updating tests

* temporarily removing failing test

* updating tests

* removing failing test

* typo

* linting

* fixing URL

* updating tests
diff --git a/test/components/audio/test_whisper_local.py b/test/components/audio/test_whisper_local.py
@@ -204,3 +204,22 @@ def test_whisper_local_transcriber(self, test_files_path):
         assert docs[2].content.strip().lower() == "answer."
         # meta.audio_file should contain the temp path where we dumped the audio bytes
         assert docs[2].meta["audio_file"]
+
+    @pytest.mark.integration
+    @pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="ffmpeg not installed on Windows CI")
+    def test_whisper_local_transcriber_pipeline_and_url_source(self):
+        """Fetch an MP3 by URL and transcribe it with the local "tiny" Whisper model in a pipeline."""
+        # Single long literal; the line-length suppression has to sit on this exact line.
+        audio_url = "https://github.com/deepset-ai/haystack/raw/refs/heads/main/test/test_files/audio/MLK_Something_happening.mp3"  # noqa: E501
+
+        # Wire the fetcher into the transcriber.
+        pipeline = Pipeline()
+        pipeline.add_component("fetcher", LinkContentFetcher())
+        pipeline.add_component("transcriber", LocalWhisperTranscriber(model="tiny"))
+        pipeline.connect("fetcher", "transcriber")
+
+        outputs = pipeline.run(data={"fetcher": {"urls": [audio_url]}})
+        documents = outputs["transcriber"]["documents"]
+
+        # The first returned document must contain this phrase from the recording.
+        assert "masses of people" in documents[0].content
diff --git a/test/components/audio/test_whisper_remote.py b/test/components/audio/test_whisper_remote.py
@@ -5,7 +5,6 @@
 import pytest
 
 from haystack import Pipeline
-from haystack.components.audio import LocalWhisperTranscriber
 from haystack.components.audio.whisper_remote import RemoteWhisperTranscriber
 from haystack.components.fetchers import LinkContentFetcher
 from haystack.dataclasses import ByteStream
@@ -100,7 +99,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
             },
         }
 
-    def test_from_dict_with_defualt_parameters(self, monkeypatch):
+    def test_from_dict_with_default_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test_api_key")
 
         data = {
@@ -147,7 +146,7 @@ def test_from_dict_with_custom_init_parameters(self, monkeypatch):
             "temperature": "0.5",
         }
 
-    def test_from_dict_with_defualt_parameters_no_env_var(self, monkeypatch):
+    def test_from_dict_with_default_parameters_no_env_var(self, monkeypatch):
         monkeypatch.delenv("OPENAI_API_KEY", raising=False)
 
         data = {
@@ -189,3 +188,25 @@ def test_whisper_remote_transcriber(self, test_files_path):
         assert str(test_files_path / "audio" / "the context for this answer is here.wav") == docs[1].meta["file_path"]
 
         assert docs[2].content.strip().lower() == "answer."
+
+    @pytest.mark.skipif(
+        not os.environ.get("OPENAI_API_KEY", None),
+        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_whisper_remote_transcriber_pipeline_and_url_source(self):
+        """Fetch an MP3 by URL and transcribe it with RemoteWhisperTranscriber in a pipeline."""
+        pipe = Pipeline()
+        pipe.add_component("fetcher", LinkContentFetcher())
+        pipe.add_component("transcriber", RemoteWhisperTranscriber())
+        pipe.connect("fetcher", "transcriber")
+        result = pipe.run(
+            data={
+                "fetcher": {
+                    "urls": [
+                        "https://github.com/deepset-ai/haystack/raw/refs/heads/main/test/test_files/audio/MLK_Something_happening.mp3"  # noqa: E501
+                    ]
+                }
+            }
+        )
+        assert "masses of people" in result["transcriber"]["documents"][0].content