OpenAI voice admin
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Ekaropolus 2025-09-17 22:44:12 -06:00
parent 88fe85c802
commit db61f56a24
2 changed files with 256 additions and 159 deletions

View File

@ -1,4 +1,8 @@
# pxy_openai/assistants.py
import logging
import tempfile
import requests # NEW
from typing import Optional # NEW
from .client import OpenAIClient
from .models import OpenAIAssistant as OpenAIAssistantModel
@ -6,30 +10,81 @@ logger = logging.getLogger(__name__)
class OpenAIAssistant:
"""
OpenAI Assistant for handling AI interactions.
OpenAI Assistant for handling AI interactions (chat + voice).
"""
def __init__(self, name):
    """
    Look up the assistant's DB configuration and build its OpenAI client.

    Args:
        name: Name of the ``OpenAIAssistantModel`` row to load.

    Raises:
        ValueError: if no assistant named *name* exists in the database.
    """
    try:
        config = OpenAIAssistantModel.objects.get(name=name)
        client = OpenAIClient(config.api_key).get_client()
    except OpenAIAssistantModel.DoesNotExist:
        raise ValueError(f"Assistant '{name}' not found in the database.")
    self.config = config
    self.client = client
# ---------- NEW: Whisper helpers ----------
def transcribe_file(self, path: str, language: Optional[str] = "es") -> str:
    """
    Run Whisper on a local audio file and return the plain-text transcript.

    Works with both the new OpenAI SDK surface
    (``client.audio.transcriptions.create``) and the legacy one
    (``client.Audio.transcriptions.create``).

    Args:
        path: Local filesystem path of the audio file.
        language: ISO language hint for Whisper; ``None``/empty disables it.

    Returns:
        The transcript as a stripped string.

    Raises:
        Exception: re-raises any SDK/IO error after logging it.
    """
    def _run(endpoint):
        # Shared call path for both SDK generations.
        with open(path, "rb") as audio_fh:
            result = endpoint.create(
                model="whisper-1",
                file=audio_fh,
                response_format="text",
                language=language or None,
            )
        return result.strip() if isinstance(result, str) else str(result)

    try:
        if hasattr(self.client, "audio") and hasattr(self.client.audio, "transcriptions"):
            # New SDK path
            return _run(self.client.audio.transcriptions)
        # Legacy SDK fallback
        return _run(self.client.Audio.transcriptions)  # type: ignore[attr-defined]
    except Exception as e:
        logger.error(f"Whisper transcription error: {e}")
        raise
def transcribe_telegram(self, bot_token: str, file_id: str, language: Optional[str] = "es") -> str:
    """
    Download a Telegram voice/audio file by file_id and transcribe it.

    Args:
        bot_token: Telegram bot token used for both the API call and the
            file download URL.
        file_id: Telegram file identifier from the update payload.
        language: ISO language hint forwarded to Whisper (default "es").

    Returns:
        The plain-text transcript.

    Raises:
        requests.HTTPError: if either Telegram request fails.
    """
    import os  # local import keeps the module's import block untouched

    # 1) Ask Telegram where the file lives on its servers.
    r = requests.get(
        f"https://api.telegram.org/bot{bot_token}/getFile",
        params={"file_id": file_id},
        timeout=10,
    )
    r.raise_for_status()
    file_path = r.json()["result"]["file_path"]

    # 2) Download the raw bytes into a temp file, preserving the original
    #    extension so the transcriber can recognize the container format.
    url = f"https://api.telegram.org/file/bot{bot_token}/{file_path}"
    with tempfile.NamedTemporaryFile(delete=False, suffix="." + file_path.split(".")[-1]) as tmp:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        tmp.write(resp.content)
        local_path = tmp.name

    # 3) Transcribe, then always delete the temp file.
    #    FIX: the file was created with delete=False and previously leaked
    #    on every call.
    try:
        return self.transcribe_file(local_path, language=language)
    finally:
        try:
            os.remove(local_path)
        except OSError:
            pass
# ---------- existing chat/agents methods ----------
def chat_completion(self, user_message):
"""
Call OpenAI's chat completion API.
"""
try:
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": self.config.description, # Use description as the system prompt
},
{"role": "system", "content": self.config.description},
{"role": "user", "content": user_message},
],
)
@ -39,23 +94,15 @@ class OpenAIAssistant:
return f"Error in chat completion: {e}"
def agent_workflow(self, user_message):
"""
Call OpenAI's advanced agent workflow API.
"""
try:
if not self.config.assistant_id:
raise ValueError(f"Assistant '{self.config.name}' does not have an associated assistant ID.")
assistant = self.client.beta.assistants.retrieve(self.config.assistant_id)
thread = self.client.beta.threads.create()
# Create a message in the thread
self.client.beta.threads.messages.create(thread_id=thread.id, role="user", content=user_message)
# Run the assistant workflow
run = self.client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id)
# Poll for the result
while run.status in ["queued", "in_progress"]:
run = self.client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
if run.status == "completed":
@ -68,9 +115,6 @@ class OpenAIAssistant:
return f"Error in agent workflow: {e}"
def handle_message(self, user_message):
"""
Automatically select the correct method based on assistant type.
"""
if self.config.is_special_assistant():
return self.agent_workflow(user_message)
return self.chat_completion(user_message)

View File

@ -1,176 +1,229 @@
# pxy_openai/views.py
import os
import io
import json
import logging
import tempfile
import requests
import openai
from typing import Any, Dict, Optional
from django.http import JsonResponse, HttpResponseBadRequest
from django.http import JsonResponse, HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.apps import apps
from django.utils.text import slugify
from pxy_bots.models import TelegramBot # to fetch the bot token from DB
from .assistants import OpenAIAssistant
# Configure OpenAI
openai.api_key = os.getenv("OPENAI_API_KEY")
logger = logging.getLogger(__name__)
# Where to forward the transcript for chat
LANGCHAIN_CHAT_URL = os.getenv(
"LANGCHAIN_CHAT_URL",
"http://app.polisplexity.tech:8010/api/langchain/chat"
)
def _download_telegram_file(bot_username: str, file_id: str) -> bytes:
# -----------------------
# Helpers
# -----------------------
def _parse_json(request) -> Dict[str, Any]:
try:
if request.content_type and "application/json" in request.content_type:
return json.loads(request.body.decode("utf-8") or "{}")
# allow x-www-form-urlencoded fallback
if request.POST and request.POST.get("payload"):
return json.loads(request.POST["payload"])
except Exception as e:
logger.warning("openai.api.bad_json: %s", e)
return {}
def _render_text(text: str) -> Dict[str, Any]:
return {
"schema_version": "render.v1",
"messages": [{"type": "text", "text": str(text)}],
}
def _render_error(text: str, status: int = 400) -> JsonResponse:
    """Ship an error as render.v1 so the bot-facing UX stays consistent."""
    payload = _render_text(f"⚠️ {text}")
    return JsonResponse(payload, status=status)
def _get_bot_and_token(bot_username: Optional[str]) -> Optional[str]:
"""
Resolve a Telegram file_id to bytes using the bot's token.
Raises ValueError on any failure.
Find Telegram bot token by either TelegramBot.name or .username matching the provided bot_username.
Returns token or None.
"""
bot = TelegramBot.objects.filter(username=bot_username, is_active=True).first()
if not bot:
raise ValueError(f"bot '{bot_username}' not found or inactive")
if not bot_username:
return None
TelegramBot = apps.get_model("pxy_bots", "TelegramBot")
bot = (TelegramBot.objects.filter(name=bot_username, is_active=True).first() or
TelegramBot.objects.filter(username=bot_username, is_active=True).first())
return bot.token if bot else None
tg_api = f"https://api.telegram.org/bot{bot.token}"
r = requests.get(f"{tg_api}/getFile", params={"file_id": file_id}, timeout=10)
if r.status_code != 200 or not r.json().get("ok"):
raise ValueError("telegram getFile failed")
file_path = r.json()["result"]["file_path"]
file_url = f"https://api.telegram.org/file/bot{bot.token}/{file_path}"
dl = requests.get(file_url, timeout=20)
if dl.status_code != 200:
raise ValueError("telegram file download failed")
return dl.content
def _assistant_from_payload(env: Dict[str, Any]) -> Optional[OpenAIAssistant]:
    """
    Build an ``OpenAIAssistant`` from the payload's ``"assistant"`` key,
    defaulting to ``"Urbanista"``.

    Returns:
        The assistant instance, or ``None`` (after logging) when
        construction fails — e.g. the name is unknown in the DB.
    """
    # NOTE: this span was an interleaved diff of the removed
    # _transcribe_bytes helper and this function; reconstructed here as the
    # single coherent new implementation.
    name = (env.get("assistant") or "Urbanista")
    try:
        return OpenAIAssistant(name=name)
    except Exception as e:
        logger.error("openai.assistant.init_failed name=%s err=%s", name, e)
        return None
# -----------------------
# /api/openai/transcribe
# -----------------------
@csrf_exempt
def transcribe(request):
    """
    Transcribe audio to text.

    JSON mode (Telegram voice):
      {
        "schema_version":"req.v1",
        "bot":{"username":"PepeBasuritaCoinsBot"},
        "user":{"language":"es"},
        "input":{"media":{"type":"voice","file_id":"..."}},
        "assistant":"Urbanista"   # optional
      }

    Multipart mode (direct upload):
      POST multipart/form-data with file=<audio file>, and optional fields:
        assistant=Urbanista
        language=es

    Returns a render.v1 JSON payload with the transcript, or a render.v1
    error message with an appropriate HTTP status.
    """
    # NOTE: this span interleaved the removed JSON-only implementation with
    # the new one; reconstructed here as the coherent new implementation.
    if request.method != "POST":
        return HttpResponse(status=405)

    # ---- Multipart direct upload ----
    if request.FILES.get("file"):
        audio = request.FILES["file"]
        language = request.POST.get("language") or "es"
        assistant_name = request.POST.get("assistant") or "Urbanista"
        try:
            assistant = OpenAIAssistant(assistant_name)
        except Exception as e:
            return _render_error(f"No pude iniciar el asistente '{assistant_name}': {e}", status=500)

        tmp_path = None
        try:
            # Keep the upload's extension so Whisper can sniff the format.
            suffix = "." + (audio.name.split(".")[-1] if "." in audio.name else "ogg")
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                for chunk in audio.chunks():
                    tmp.write(chunk)
                tmp_path = tmp.name
            text = assistant.transcribe_file(tmp_path, language=language)
            return JsonResponse(_render_text(text))
        except Exception as e:
            logger.exception("openai.transcribe.upload_error")
            return _render_error(f"No se pudo transcribir el audio subido: {e}", status=500)
        finally:
            # FIX: the temp file was previously leaked on every request.
            if tmp_path:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    # ---- JSON mode (Telegram voice note) ----
    env = _parse_json(request)
    assistant = _assistant_from_payload(env)
    if not assistant:
        return _render_error("No pude cargar el asistente de OpenAI (revisa el nombre o la configuración).", status=500)

    user_lang = ((env.get("user") or {}).get("language")) or "es"
    media = ((env.get("input") or {}).get("media") or {})
    file_id = media.get("file_id")
    bot_username = ((env.get("bot") or {}).get("username"))

    if not file_id:
        return _render_error("No encontré audio en la petición (falta media.file_id o file).", status=400)

    token = _get_bot_and_token(bot_username)
    if not token:
        return _render_error("No encontré el bot o su token para descargar el audio (bot.username).", status=400)

    try:
        text = assistant.transcribe_telegram(token, file_id, language=user_lang)
        return JsonResponse(_render_text(text))
    except Exception as e:
        logger.exception("openai.transcribe.telegram_error")
        return _render_error(f"No se pudo transcribir el audio: {e}", status=500)
# -----------------------
# /api/openai/voice_chat
# -----------------------
@csrf_exempt
def voice_chat(request):
    """
    Transcribe (if voice present) and then chat with the transcript.

    JSON (Telegram):
      {
        "schema_version":"req.v1",
        "bot":{"username":"PepeBasuritaCoinsBot"},
        "user":{"id":999,"language":"es"},
        "command":{"name":"__voice__","version":1,"trigger":"text_command"},
        "input":{"media":{"type":"voice","file_id":"..."}, "text":"(optional prompt)"},
        "assistant":"Urbanista"   # optional
      }

    Multipart (direct upload + prompt):
      POST multipart/form-data with:
        file=<audio>, prompt="...", assistant="Urbanista", language="es"

    Returns a render.v1 JSON payload with the assistant's reply, or a
    render.v1 error message with an appropriate HTTP status.
    """
    # NOTE: this span interleaved the removed LangChain-forwarding
    # implementation with the new one; reconstructed here as the coherent
    # new implementation.
    if request.method != "POST":
        return HttpResponse(status=405)

    # ---- Multipart mode (file + optional prompt) ----
    if request.FILES.get("file"):
        audio = request.FILES["file"]
        prompt = request.POST.get("prompt") or ""
        language = request.POST.get("language") or "es"
        assistant_name = request.POST.get("assistant") or "Urbanista"
        try:
            assistant = OpenAIAssistant(assistant_name)
        except Exception as e:
            return _render_error(f"No pude iniciar el asistente '{assistant_name}': {e}", status=500)

        tmp_path = None
        try:
            suffix = "." + (audio.name.split(".")[-1] if "." in audio.name else "ogg")
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                for chunk in audio.chunks():
                    tmp.write(chunk)
                tmp_path = tmp.name
            transcript = assistant.transcribe_file(tmp_path, language=language)
            user_message = transcript if not prompt else f"{prompt}\n\n[Transcripción]\n{transcript}"
            reply = assistant.chat_completion(user_message)
            return JsonResponse(_render_text(reply))
        except Exception as e:
            logger.exception("openai.voice_chat.upload_error")
            return _render_error(f"No se pudo procesar el audio: {e}", status=500)
        finally:
            # FIX: the temp file was previously leaked on every request.
            if tmp_path:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    # ---- JSON mode (Telegram) ----
    env = _parse_json(request)
    assistant = _assistant_from_payload(env)
    if not assistant:
        return _render_error("No pude cargar el asistente de OpenAI (revisa el nombre o la configuración).", status=500)

    user_lang = ((env.get("user") or {}).get("language")) or "es"
    media = ((env.get("input") or {}).get("media") or {})
    file_id = media.get("file_id")
    prompt = ((env.get("input") or {}).get("text")) or ""

    # No audio present: just chat with the provided text (keeps endpoint usable).
    if not file_id:
        if not prompt:
            return JsonResponse(_render_text("¡Hola! 🌆 ¿Sobre qué te gustaría hablar hoy?"))
        try:
            reply = assistant.chat_completion(prompt)
            return JsonResponse(_render_text(reply))
        except Exception as e:
            logger.exception("openai.voice_chat.text_only_error")
            return _render_error(f"No se pudo generar respuesta: {e}", status=500)

    # With a Telegram voice note: resolve the bot token to download it.
    bot_username = ((env.get("bot") or {}).get("username"))
    token = _get_bot_and_token(bot_username)
    if not token:
        return _render_error("No encontré el bot o su token para descargar el audio (bot.username).", status=400)

    try:
        transcript = assistant.transcribe_telegram(token, file_id, language=user_lang)
        user_message = transcript if not prompt else f"{prompt}\n\n[Transcripción]\n{transcript}"
        reply = assistant.chat_completion(user_message)
        return JsonResponse(_render_text(reply))
    except Exception as e:
        logger.exception("openai.voice_chat.telegram_error")
        return _render_error(f"No se pudo procesar el audio: {e}", status=500)