Whisper Open AI connections

2025-09-17 20:29:33 -06:00 · 2025-09-17 20:29:33 -06:00 · 88fe85c802
commit 88fe85c802
parent 3fa732efbc
6 changed files with 198 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -31,3 +31,4 @@ Dockerfile.dev
 docker-compose.override.yml
 docker-compose.override.yml
 pxy_meta_pages.zip
+pxy_openai.zip
--- a/polisplexity/urls.py
+++ b/polisplexity/urls.py
@ -48,6 +48,8 @@ urlpatterns = [
    path("api/", include("pxy_bots.api.urls")),
    path("api/langchain/", include("pxy_langchain.api.urls")),

+    path("", include("pxy_openai.urls")),
+


 ]
--- a/pxy_dashboard/middleware.py
+++ b/pxy_dashboard/middleware.py
@ -105,6 +105,18 @@ EXEMPT_URLS += [
    re.compile(r"^api/langchain/chat/?$"),
 ]

+# OpenAI speech endpoints. The patterns have no leading or trailing slash,
+# matching the route strings declared in pxy_openai/urls.py.
+# NOTE(review): presumably these entries let the routes bypass
+# LoginRequiredMiddleware — confirm unauthenticated access is intended.
+EXEMPT_URLS += [
+    re.compile(r"^api/openai/transcribe$"),
+    re.compile(r"^api/openai/voice_chat$"),
+]
+
+
+
+
+

 class LoginRequiredMiddleware(MiddlewareMixin):
    def process_request(self, request):
--- a/pxy_openai/services.py
+++ b/pxy_openai/services.py
--- a/pxy_openai/urls.py
+++ b/pxy_openai/urls.py
@ -0,0 +1,8 @@
+# pxy_openai/urls.py
+from django.urls import path
+from .views import transcribe, voice_chat
+
+# The project URLconf includes this module at the site root (""), so every
+# route spells out its full "api/openai/..." path.
+urlpatterns = [
+    path(
+        route="api/openai/transcribe",
+        view=transcribe,
+        name="openai_transcribe",
+    ),
+    path(
+        route="api/openai/voice_chat",
+        view=voice_chat,
+        name="openai_voice_chat",
+    ),
+]
--- a/pxy_openai/views.py
+++ b/pxy_openai/views.py
@ -1,3 +1,176 @@
-from django.shortcuts import render
+# pxy_openai/views.py
+import os
+import io
+import json
+import tempfile
+import requests
+import openai

-# Create your views here.
+from django.http import JsonResponse, HttpResponseBadRequest
+from django.views.decorators.csrf import csrf_exempt
+
+from pxy_bots.models import TelegramBot  # to fetch the bot token from DB
+
+# OpenAI credentials are read from the environment once, at import time.
+# api_key stays None when OPENAI_API_KEY is not set.
+openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+# Endpoint that receives the transcript for chat; override it with the
+# LANGCHAIN_CHAT_URL environment variable.
+LANGCHAIN_CHAT_URL = os.environ.get(
+    "LANGCHAIN_CHAT_URL",
+    "http://app.polisplexity.tech:8010/api/langchain/chat",
+)
+
+def _download_telegram_file(bot_username: str, file_id: str) -> bytes:
+    """
+    Resolve a Telegram file_id to bytes using the bot's token.
+
+    Looks up the active TelegramBot row for ``bot_username``, asks the Bot API
+    ``getFile`` method for the file path, then downloads that path.
+
+    Raises ValueError on any failure. The messages never include the request
+    URL: it embeds the bot token, and the views echo the message to clients.
+    """
+    bot = TelegramBot.objects.filter(username=bot_username, is_active=True).first()
+    if not bot:
+        raise ValueError(f"bot '{bot_username}' not found or inactive")
+
+    try:
+        r = requests.get(
+            f"https://api.telegram.org/bot{bot.token}/getFile",
+            params={"file_id": file_id},
+            timeout=10,
+        )
+        # Parse the body once; a non-JSON body raises a ValueError subclass.
+        info = r.json() if r.status_code == 200 else None
+    except (requests.RequestException, ValueError):
+        # `from None` keeps the token-bearing URL out of the chained traceback.
+        raise ValueError("telegram getFile failed") from None
+    if not isinstance(info, dict) or not info.get("ok"):
+        raise ValueError("telegram getFile failed")
+
+    # Guard the nested lookup so a malformed reply is a ValueError, not a
+    # KeyError/TypeError.
+    result = info.get("result")
+    file_path = result.get("file_path") if isinstance(result, dict) else None
+    if not file_path:
+        raise ValueError("telegram getFile returned no file_path")
+
+    try:
+        dl = requests.get(
+            f"https://api.telegram.org/file/bot{bot.token}/{file_path}",
+            timeout=20,
+        )
+    except requests.RequestException:
+        raise ValueError("telegram file download failed") from None
+    if dl.status_code != 200:
+        raise ValueError("telegram file download failed")
+    return dl.content
+
+
+def _transcribe_bytes(raw: bytes, language: str = "es") -> str:
+    """
+    Transcribe OGG/Opus (or other) audio bytes with OpenAI.
+
+    Args:
+        raw: Encoded audio payload.
+        language: Language hint sent with the request; "es" is used when empty.
+
+    Returns:
+        The transcript as plain text with surrounding whitespace removed.
+
+    Raises:
+        ValueError: If ``raw`` is empty.
+        Exception: Whatever the OpenAI client raises for a failed request.
+    """
+    if not raw:
+        raise ValueError("empty audio payload")
+
+    # Upload straight from memory as a (filename, content) pair. This avoids
+    # re-opening a still-open NamedTemporaryFile by name, which is not
+    # portable (it fails on Windows), and skips a disk round trip.
+    # The ".ogg" filename keeps the same extension the temp file used.
+    # "gpt-4o-transcribe" or "whisper-1" depending on your account
+    result = openai.audio.transcriptions.create(
+        model="gpt-4o-transcribe",
+        file=("audio.ogg", raw),
+        response_format="text",
+        language=language or "es",
+    )
+    text = result if isinstance(result, str) else str(result)
+    return text.strip()
+
+
+def _dict_at(mapping, key):
+    """Return ``mapping[key]`` when it is a dict, otherwise an empty dict."""
+    value = mapping.get(key)
+    return value if isinstance(value, dict) else {}
+
+
+def _resolve_transcript(request):
+    """
+    Validate a req.v1 POST and obtain the text both endpoints work with.
+
+    Returns a ``(result, error_response)`` pair in which exactly one item is
+    None. ``result`` is a dict with the keys ``data`` (the parsed request
+    body), ``text``, ``language`` and ``has_media``.
+    """
+    if request.method != "POST":
+        return None, HttpResponseBadRequest("POST only")
+
+    try:
+        data = json.loads(request.body.decode("utf-8") or "{}")
+    except Exception:
+        return None, HttpResponseBadRequest("invalid json")
+    if not isinstance(data, dict):
+        # Well-formed JSON that is not an object (list, string, null, ...)
+        # would otherwise crash the lookups below and surface as a 500.
+        return None, HttpResponseBadRequest("invalid json")
+
+    # Each section is coerced to a dict so a wrongly typed field is treated
+    # as missing instead of raising AttributeError.
+    payload_input = _dict_at(data, "input")
+    bot_username = _dict_at(data, "bot").get("username")
+    file_id = _dict_at(payload_input, "media").get("file_id")
+    language = _dict_at(data, "user").get("language") or "es"
+    text_fallback = payload_input.get("text") or ""
+
+    if not bot_username:
+        return None, JsonResponse({"error": "missing bot.username"}, status=400)
+
+    transcript = None
+    if file_id:
+        try:
+            blob = _download_telegram_file(bot_username, file_id)
+            transcript = _transcribe_bytes(blob, language=language)
+        except requests.RequestException:
+            # requests error text quotes the full URL, which embeds the bot
+            # token, so it must not be echoed back to the client.
+            return None, JsonResponse(
+                {"error": "transcription failed: telegram request error"},
+                status=502,
+            )
+        except Exception as e:
+            return None, JsonResponse(
+                {"error": f"transcription failed: {e}"}, status=502
+            )
+
+    if not transcript:
+        # Fallback to provided text so you can test without a voice note
+        if not text_fallback:
+            return None, JsonResponse(
+                {"error": "missing voice file_id or text fallback"}, status=400
+            )
+        transcript = text_fallback
+
+    result = {
+        "data": data,
+        "text": transcript,
+        "language": language,
+        "has_media": bool(file_id),
+    }
+    return result, None
+
+
+@csrf_exempt
+def transcribe(request):
+    """
+    POST /api/openai/transcribe
+    Accepts req.v1. If input.media.file_id exists, we fetch from Telegram and transcribe.
+    If not, we fall back to input.text (handy for quick tests).
+    Returns:
+      {"schema_version":"proc.v1","kind":"transcript","text":"...","meta":{...}}
+    Errors: 400 for a bad method, body or missing fields; 502 when the
+    Telegram download or the transcription fails.
+    """
+    result, error = _resolve_transcript(request)
+    if error is not None:
+        return error
+
+    return JsonResponse({
+        "schema_version": "proc.v1",
+        "kind": "transcript",
+        "text": result["text"],
+        "meta": {
+            "language": result["language"],
+            "has_media": result["has_media"],
+        }
+    })
+
+
+@csrf_exempt
+def voice_chat(request):
+    """
+    POST /api/openai/voice_chat
+    - Transcribe Telegram voice/audio from req.v1
+    - Forward a modified req.v1 (with input.text = transcript) to LangChain chat
+    - Return the LangChain render.v1 response (pass-through)
+    Errors: 400 for a bad method, body or missing fields; 502 when the
+    transcription or the forward request fails.
+    """
+    result, error = _resolve_transcript(request)
+    if error is not None:
+        return error
+
+    # Build a new req.v1 for LangChain: keep everything, set input.text to transcript.
+    # Both dicts are shallow copies, so the parsed request is not mutated.
+    forward_payload = dict(result["data"])
+    forward_input = dict(_dict_at(forward_payload, "input"))
+    forward_input["text"] = result["text"]
+    # keep media in case downstream wants it; also annotate
+    forward_input["_transcript"] = True
+    forward_payload["input"] = forward_input
+
+    try:
+        r = requests.post(
+            LANGCHAIN_CHAT_URL,
+            headers={"Content-Type": "application/json"},
+            data=json.dumps(forward_payload, ensure_ascii=False).encode("utf-8"),
+            timeout=30
+        )
+    except Exception as e:
+        return JsonResponse({"error": f"forward to langchain failed: {e}"}, status=502)
+
+    # Proxy through the downstream response (expecting render.v1)
+    try:
+        body = r.json()
+    except Exception:
+        # Non-JSON reply: wrap the first 1000 characters as a text message.
+        body = {"schema_version": "render.v1",
+                "messages": [{"type": "text", "text": r.text[:1000]}]}
+    return JsonResponse(body, status=r.status_code or 200, safe=False)