From 6e411e2a7725b9250d06cc8625a283b692fc2812 Mon Sep 17 00:00:00 2001 From: dtiberio Date: Fri, 13 Dec 2024 15:02:14 +0000 Subject: [PATCH] first commit --- .env copy | 4 + .gitignore | 2 + README.md | 34 +++- live_api_starter_cv.md | 112 +++++++++++++ live_api_starter_cv.py | 329 +++++++++++++++++++++++++++++++++++++++ live_api_starter_desk.md | 155 ++++++++++++++++++ live_api_starter_desk.py | 317 +++++++++++++++++++++++++++++++++++++ requirements.txt | 21 +++ 8 files changed, 972 insertions(+), 2 deletions(-) create mode 100644 .env copy create mode 100644 live_api_starter_cv.md create mode 100644 live_api_starter_cv.py create mode 100644 live_api_starter_desk.md create mode 100644 live_api_starter_desk.py create mode 100644 requirements.txt diff --git a/.env copy b/.env copy new file mode 100644 index 0000000..368965b --- /dev/null +++ b/.env copy @@ -0,0 +1,4 @@ +# update this with your private key +# you can get one from the Google AI Studio + +GEMINI_API_KEY = 'YOUR_PRIVATE_GEMINI_API_KEY' \ No newline at end of file diff --git a/.gitignore b/.gitignore index 68bc17f..b178d45 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +logs/ \ No newline at end of file diff --git a/README.md b/README.md index 8373ef2..ac4e64e 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,32 @@ -# Gemini_2.0_Live_API_Tutorials - Gemini_2.0_Live_API_Tutorials +# Gemini_2.0_Live_API_Tutorials + +These two Python files provide a demo of the newly release Google Gemini 2.0 Live API. + +## live_api_starter_cv.md +The Live API Starter is a Python application that implements real-time audio and video interaction with Google's Gemini AI model. 
It creates a bidirectional communication channel where users can send text, audio, and video input while receiving audio and text responses from the model in real-time. +The code shares the video from your webcam with the Gemini model, while you can also do a voice chat. + +## live_api_starter_desk.py +This application is a desktop assistant that combines audio input/output capabilities with screen capture functionality to interact with Google's Gemini API. It creates an interactive experience where users can communicate with the Gemini model through both voice and text while sharing their screen. +The code shares your desktop with the Gemini model, while you can also do a voice chat. + +# References: + https://github.com/google-gemini/cookbook/blob/main/gemini-2/README.md + https://github.com/google-gemini/cookbook/blob/main/gemini-2/live_api_starter.py + +## The new Gemini 2.0 Live API requires: + https://pypi.org/project/google-genai/ + https://github.com/googleapis/python-genai + +To learn more, see the Python SDK reference: + https://googleapis.github.io/python-genai/ + +# Possible bugs +During testing I've noticed that the Gemini model sometimes fails to "see" the webcam or the desktop when you request that via the text prompts, however, it usually works well if you make the same request via the voice prompt. +This might be due to the nature of the experimental release at the time of the tests. + +# Please note +The code is provided as-is for learning purposes, please don't expect any updates in the future. +I've made some changes to the code in the Google cookbook to add some logging and troubleshooting details. +This code was tested with Python 3.12 running on Windows 11. +Unfortunately, I can't provide any support. 
diff --git a/live_api_starter_cv.md b/live_api_starter_cv.md new file mode 100644 index 0000000..5a0dbf1 --- /dev/null +++ b/live_api_starter_cv.md @@ -0,0 +1,112 @@ +# Live API Starter Documentation + +References: + https://github.com/google-gemini/cookbook/blob/main/gemini-2/README.md + https://github.com/google-gemini/cookbook/blob/main/gemini-2/live_api_starter.py + +The new Live API requires: + https://pypi.org/project/google-genai/ + https://github.com/googleapis/python-genai + +This PyPI package doesn't support Live API: + https://pypi.org/project/google-generativeai/ + https://github.com/google-gemini/generative-ai-python + +Notes as of 2024-12-13: + `google-generativeai` - old python sdk, for Gemini API in Google AI Studio only + `google-vertexai` - more complex, for Gemini LLM models in Google Vertex AI only + `google-genai` - new one sdk, for both VertexAI and Gemini API. Supports the Live API. + +## The NEW GenAI API: + +The new Google Gen AI SDK provides a unified interface to Gemini 2.0 through both the Gemini Developer API and the Gemini Enterprise API ( Vertex AI). +With a few exceptions, code that runs on one platform will run on both. The Gen AI SDK also supports the Gemini 1.5 models. + +Python +The Google Gen AI SDK for Python is available on PyPI and GitHub: + google-genai on PyPI --> `pip install google-genai` + python-genai on GitHub + +To learn more, see the Python SDK reference: + https://googleapis.github.io/python-genai/ + +Quickstart +1. Import libraries + ``` python + from google import genai + from google.genai import types + ``` +2. Create a client + ``` python + client = genai.Client(api_key='YOUR_API_KEY') + ``` +3. Generate content + ``` python + response = client.models.generate_content( + model='gemini-2.0-flash-exp', contents='What is your name?' + ) + print(response.text) + ``` + +## Overview +The Live API Starter is a Python application that implements real-time audio and video interaction with Google's Gemini AI model. 
It creates a bidirectional communication channel where users can send text, audio, and video input while receiving audio and text responses from the model in real-time. + +## Key Features +- Real-time audio input/output processing +- Video capture and streaming +- Text-based interaction +- Asynchronous operation +- Bidirectional communication with Gemini AI model + +## Class Documentation + +### AudioLoop +Main class that manages the audio/video streaming pipeline and communication with the Gemini AI model. + +## Method Documentation + +### AudioLoop.__init__ +Initializes queues for audio/video processing and sets up session management. + +### AudioLoop.send_text +Handles text input from the user and sends it to the Gemini session. + +### AudioLoop._get_frame +Captures and processes a single video frame, converting it to JPEG format with size constraints. + +### AudioLoop.get_frames +Continuously captures video frames from the default camera and adds them to the video queue. + +### AudioLoop.send_frames +Sends captured video frames to the Gemini session. + +### AudioLoop.listen_audio +Sets up and manages audio input stream from the microphone. + +### AudioLoop.send_audio +Sends audio chunks from the output queue to the Gemini session. + +### AudioLoop.receive_audio +Processes responses from the Gemini model, handling both text and audio data. + +### AudioLoop.play_audio +Manages audio playback of responses received from the model. + +### AudioLoop.run +Main execution method that coordinates all the async tasks and manages the session lifecycle. 
+ +## Global Constants + +- FORMAT: Set to pyaudio.paInt16 for audio format +- CHANNELS: Set to 1 for mono audio +- SEND_SAMPLE_RATE: 16000Hz for input audio +- RECEIVE_SAMPLE_RATE: 24000Hz for output audio +- CHUNK_SIZE: 512 bytes for audio processing +- MODEL: Uses "models/gemini-2.0-flash-exp" for AI interactions + +## Technical Details +- Uses asyncio for concurrent operations +- Implements PyAudio for audio handling +- Uses OpenCV (cv2) for video capture +- Integrates with Google's Genai client +- Supports Python 3.11+ with fallback for earlier versions \ No newline at end of file diff --git a/live_api_starter_cv.py b/live_api_starter_cv.py new file mode 100644 index 0000000..8873560 --- /dev/null +++ b/live_api_starter_cv.py @@ -0,0 +1,329 @@ +import asyncio +import base64 +import io +import os +from dotenv import load_dotenv +import sys +import traceback +import logging +from datetime import datetime + +import cv2 +import pyaudio +import PIL.Image + +from google import genai + +# Set up logging +# Set up logging +def setup_logging(): + """Setup logging configuration with both file and console output""" + # Create logs directory if it doesn't exist + logs_dir = "logs" + if not os.path.exists(logs_dir): + os.makedirs(logs_dir) + + # Create timestamp for filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_filename = os.path.join(logs_dir, f"gemini_cv_{timestamp}.log") + + # Configure logging + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + # File handler with timestamp filename + logging.FileHandler(log_filename), + # # Console handler + # logging.StreamHandler() + ] + ) + + logger = logging.getLogger(__name__) + # Print just this one message to console so user knows where logs are going + print(f"Logging to file: {log_filename}") + logger.info(f"Logging started - Log file: {log_filename}") + return logger + +if sys.version_info < (3, 11, 0): + import taskgroup, exceptiongroup + 
asyncio.TaskGroup = taskgroup.TaskGroup + asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup + +# Configure Gemini API Key +# Load environment variables from the .env file +load_dotenv() +# Access the API key +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") + +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +SEND_SAMPLE_RATE = 16000 +RECEIVE_SAMPLE_RATE = 24000 +CHUNK_SIZE = 512 + +MODEL = "models/gemini-2.0-flash-exp" + +client = genai.Client( + http_options={'api_version': 'v1alpha'}, + api_key=GEMINI_API_KEY + ) + +CONFIG={ + "generation_config": {"response_modalities": ["AUDIO"]}} + +pya = pyaudio.PyAudio() + +class AudioLoop: + def __init__(self): + self.audio_in_queue = asyncio.Queue() + self.audio_out_queue = asyncio.Queue() + self.video_out_queue = asyncio.Queue() + self.session = None + self.send_text_task = None + self.receive_audio_task = None + self.play_audio_task = None + logger.info("AudioLoop initialized") + + async def send_text(self): + while True: + text = await asyncio.to_thread(input, "message > ") + if text.lower() == "q": + break + await self.session.send(text or ".", end_of_turn=True) + + def _get_frame(self, cap): + try: + # Log camera properties + if cap.isOpened(): + logger.debug(f"Camera is open - Width: {cap.get(cv2.CAP_PROP_FRAME_WIDTH)}, Height: {cap.get(cv2.CAP_PROP_FRAME_HEIGHT)}") + else: + logger.error("Camera is not open") + return None + + # Read the frame + ret, frame = cap.read() + + # Check if the frame was read successfully + if not ret: + logger.error("Failed to read frame from camera") + return None + + logger.debug(f"Frame captured - Shape: {frame.shape}") + + # Convert to PIL Image + img = PIL.Image.fromarray(frame) + original_size = img.size + img.thumbnail([1024, 1024]) + logger.debug(f"Image resized from {original_size} to {img.size}") + + # Convert to bytes + image_io = io.BytesIO() + img.save(image_io, format="jpeg") + image_io.seek(0) + + mime_type = "image/jpeg" + image_bytes = image_io.read() + encoded_size = 
len(image_bytes) + logger.debug(f"Image encoded - Size: {encoded_size} bytes") + + return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()} + except Exception as e: + logger.error(f"Error in _get_frame: {str(e)}") + logger.error(traceback.format_exc()) + return None + + async def get_frames(self): + try: + logger.info("Attempting to open camera...") + cap = await asyncio.to_thread(cv2.VideoCapture, 0) + + if not cap.isOpened(): + logger.error("Failed to open camera") + return + + logger.info("Camera opened successfully") + frame_count = 0 + + while True: + frame = await asyncio.to_thread(self._get_frame, cap) + if frame is None: + logger.error("Frame capture failed") + break + + frame_count += 1 + if frame_count % 10 == 0: # Log every 10th frame + logger.debug(f"Captured frame {frame_count}") + + await asyncio.sleep(1.0) + + try: + self.video_out_queue.put_nowait(frame) + logger.debug(f"Frame {frame_count} added to queue") + except Exception as e: + logger.error(f"Error adding frame to queue: {str(e)}") + + logger.info("Releasing camera...") + cap.release() + + except Exception as e: + logger.error(f"Error in get_frames: {str(e)}") + logger.error(traceback.format_exc()) + + async def send_frames(self): + frame_count = 0 + try: + while True: + frame = await self.video_out_queue.get() + frame_count += 1 + logger.debug(f"Sending frame {frame_count} to session") + + try: + await self.session.send(frame) + logger.debug(f"Frame {frame_count} sent successfully") + except Exception as e: + logger.error(f"Error sending frame {frame_count}: {str(e)}") + except Exception as e: + logger.error(f"Error in send_frames: {str(e)}") + logger.error(traceback.format_exc()) + + async def listen_audio(self): + logger.info("Starting audio listening...") + try: + pya = pyaudio.PyAudio() + mic_info = pya.get_default_input_device_info() + logger.debug(f"Using microphone: {mic_info['name']}") + + stream = await asyncio.to_thread( + pya.open, + format=FORMAT, + 
channels=CHANNELS, + rate=SEND_SAMPLE_RATE, + input=True, + input_device_index=mic_info["index"], + frames_per_buffer=CHUNK_SIZE, + ) + logger.info("Audio stream opened successfully") + + while True: + data = await asyncio.to_thread(stream.read, CHUNK_SIZE) + self.audio_out_queue.put_nowait(data) + except Exception as e: + logger.error(f"Error in listen_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def send_audio(self): + try: + chunk_count = 0 + while True: + chunk = await self.audio_out_queue.get() + chunk_count += 1 + if chunk_count % 100 == 0: # Log every 100th chunk + logger.debug(f"Sending audio chunk {chunk_count}") + await self.session.send({"data": chunk, "mime_type": "audio/pcm"}) + except Exception as e: + logger.error(f"Error in send_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def receive_audio(self): + try: + while True: + async for response in self.session.receive(): + server_content = response.server_content + if server_content is not None: + model_turn = server_content.model_turn + if model_turn is not None: + parts = model_turn.parts + + for part in parts: + if part.text is not None: + print(part.text, end="") + elif part.inline_data is not None: + self.audio_in_queue.put_nowait(part.inline_data.data) + + server_content.model_turn = None + turn_complete = server_content.turn_complete + if turn_complete: + logger.debug("Turn complete received") + while not self.audio_in_queue.empty(): + self.audio_in_queue.get_nowait() + except Exception as e: + logger.error(f"Error in receive_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def play_audio(self): + try: + logger.info("Starting audio playback...") + pya = pyaudio.PyAudio() + stream = await asyncio.to_thread( + pya.open, format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True + ) + logger.info("Audio playback stream opened successfully") + + while True: + bytestream = await self.audio_in_queue.get() + await 
asyncio.to_thread(stream.write, bytestream) + except Exception as e: + logger.error(f"Error in play_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def run(self): + logger.info("Starting AudioLoop.run()") + try: + async with ( + client.aio.live.connect(model=MODEL, config=CONFIG) as session, + asyncio.TaskGroup() as tg, + ): + self.session = session + logger.info("Session connected successfully") + + send_text_task = tg.create_task(self.send_text()) + + def cleanup(task): + logger.info("Cleaning up tasks...") + for t in tg._tasks: + t.cancel() + logger.info("Tasks cleanup complete") + + send_text_task.add_done_callback(cleanup) + + # Create all tasks + tasks = [ + tg.create_task(self.listen_audio()), + tg.create_task(self.send_audio()), + tg.create_task(self.get_frames()), + tg.create_task(self.send_frames()), + tg.create_task(self.receive_audio()), + tg.create_task(self.play_audio()) + ] + + def check_error(task): + if task.cancelled(): + logger.debug(f"Task {task.get_name()} was cancelled") + return + + if task.exception() is not None: + e = task.exception() + logger.error(f"Task {task.get_name()} failed with exception:") + logger.error(traceback.format_exception(None, e, e.__traceback__)) + sys.exit(1) + + for task in tg._tasks: + task.add_done_callback(check_error) + + except Exception as e: + logger.error(f"Error in run: {str(e)}") + logger.error(traceback.format_exc()) + +if __name__ == "__main__": + logger = setup_logging() + logger.info("Starting application...") + print ("Application started, type 'q' and press Enter to exit.") + try: + main = AudioLoop() + asyncio.run(main.run()) + except KeyboardInterrupt: + logger.info("Application stopped by user") + except Exception as e: + logger.error(f"Application error: {str(e)}") + logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/live_api_starter_desk.md b/live_api_starter_desk.md new file mode 100644 index 0000000..eee1c23 --- /dev/null +++ 
b/live_api_starter_desk.md @@ -0,0 +1,155 @@ +# Gemini Live API Desktop Assistant + +This application is a desktop assistant that combines audio input/output capabilities with screen capture functionality to interact with Google's Gemini API. It creates an interactive experience where users can communicate with the Gemini model through both voice and text while sharing their screen. + +## Features + +- Real-time audio input and output +- Screen capture and streaming +- Text-based chat interface +- Asynchronous processing +- Comprehensive logging system + +## Technical Overview + +### Main Components + +#### AudioLoop Class +The primary class that manages all audio, video, and interaction functionality. + +##### Key Attributes: +- `audio_in_queue`: AsyncIO queue for incoming audio +- `audio_out_queue`: AsyncIO queue for outgoing audio +- `video_out_queue`: AsyncIO queue for screen capture frames +- `session`: Manages the connection to Gemini API + +##### Methods: + +1. `__init__()` + - Initializes queues and session variables + - Sets up basic configuration for audio and video processing + +2. `send_text()` + - Handles text input from the user + - Allows for text-based interaction with the model + - Processes quit command ('q') + +3. `_get_screen_frame()` + - Captures screen using PIL + - Processes and resizes image to meet Gemini API requirements + - Converts image to JPEG format and base64 encodes it + - Returns formatted frame data + +4. `get_frames()` + - Asynchronously captures screen frames + - Manages frame capture rate (1 FPS) + - Handles error logging and recovery + +5. `send_frames()` + - Sends captured frames to Gemini session + - Manages frame queue and transmission + - Handles error logging + +6. `listen_audio()` + - Initializes audio input stream + - Captures microphone input + - Processes audio chunks + +7. `send_audio()` + - Sends audio data to Gemini API + - Manages audio transmission queue + - Handles chunked audio data + +8. 
`receive_audio()` + - Processes responses from Gemini API + - Handles both text and audio responses + - Manages response queuing + +9. `play_audio()` + - Manages audio output stream + - Plays received audio responses + - Handles audio playback queue + +10. `run()` + - Main execution method + - Creates and manages all async tasks + - Handles session lifecycle and cleanup + +### Utility Functions + +#### setup_logging() +- Creates logging directory and timestamped log files +- Configures logging format and handlers +- Returns configured logger instance + +### Technical Specifications + +- Audio Format: 16-bit PCM +- Audio Channels: Mono (1 channel) +- Input Sample Rate: 16000 Hz +- Output Sample Rate: 24000 Hz +- Chunk Size: 512 bytes +- Screen Capture: 1 FPS +- Image Format: JPEG (quality: 80) +- Maximum Image Dimensions: 1024x1024 + +## Requirements + +- Python 3.11+ (for native `asyncio.TaskGroup`) +- PyAudio +- PIL (Python Imaging Library) +- Google Generative AI Python SDK +- Base64 +- AsyncIO + +## Configuration + +The application uses the following key configurations: + +```python +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +SEND_SAMPLE_RATE = 16000 +RECEIVE_SAMPLE_RATE = 24000 +CHUNK_SIZE = 512 +MODEL = "models/gemini-2.0-flash-exp" +``` + +## Error Handling + +- Comprehensive error logging +- Task-specific error callbacks +- Graceful cleanup on failure +- Automatic retry mechanisms + +## Logging System + +The application implements a robust logging system that: +- Creates timestamped log files +- Logs to both file and console +- Captures different log levels (DEBUG, INFO, ERROR) +- Provides detailed error tracebacks + +## Usage + +1. Set up your Gemini API key: +```python +GEMINI_API_KEY = 'your_api_key_here' +``` + +2. Run the application: +```bash +python live_api_starter_desk.py +``` + +3. 
Interact using: + - Voice through your default microphone + - Text by typing in the console + - Type 'q' to quit + +## Notes + +- The application requires appropriate microphone permissions +- Screen capture may require additional permissions on some systems +- Memory usage should be monitored during extended sessions +- Network bandwidth usage can be significant due to continuous audio/video streaming \ No newline at end of file diff --git a/live_api_starter_desk.py b/live_api_starter_desk.py new file mode 100644 index 0000000..c52d4e8 --- /dev/null +++ b/live_api_starter_desk.py @@ -0,0 +1,317 @@ +import asyncio +import base64 +import io +import sys +import traceback +import logging +from dotenv import load_dotenv +import os +from datetime import datetime + +import pyaudio +import PIL.Image +import PIL.ImageGrab + +from google import genai + +# Set up logging +def setup_logging(): + """Setup logging configuration with both file and console output""" + # Create logs directory if it doesn't exist + logs_dir = "logs" + if not os.path.exists(logs_dir): + os.makedirs(logs_dir) + + # Create timestamp for filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_filename = os.path.join(logs_dir, f"gemini_desk_{timestamp}.log") + + # Configure logging + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + # File handler with timestamp filename + logging.FileHandler(log_filename), + # # Console handler + # logging.StreamHandler() + ] + ) + + logger = logging.getLogger(__name__) + # Print just this one message to console so user knows where logs are going + print(f"Logging to file: {log_filename}") + logger.info(f"Logging started - Log file: {log_filename}") + return logger + +if sys.version_info < (3, 11, 0): + import taskgroup, exceptiongroup + asyncio.TaskGroup = taskgroup.TaskGroup + asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup + +# Configure Gemini API Key +# Load environment variables 
from the .env file +load_dotenv() +# Access the API key +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") + +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +SEND_SAMPLE_RATE = 16000 +RECEIVE_SAMPLE_RATE = 24000 +CHUNK_SIZE = 512 + +MODEL = "models/gemini-2.0-flash-exp" + +client = genai.Client( + http_options={'api_version': 'v1alpha'}, + api_key=GEMINI_API_KEY + ) + +CONFIG={ + "generation_config": {"response_modalities": ["AUDIO"]}} + +pya = pyaudio.PyAudio() + + +class AudioLoop: + def __init__(self): + self.audio_in_queue = asyncio.Queue() + self.audio_out_queue = asyncio.Queue() + self.video_out_queue = asyncio.Queue() + self.session = None + self.send_text_task = None + self.receive_audio_task = None + self.play_audio_task = None + logger.info("AudioLoop initialized with screen capture") + + async def send_text(self): + while True: + text = await asyncio.to_thread(input, "message > ") + if text.lower() == "q": + break + await self.session.send(text or ".", end_of_turn=True) + + def _get_screen_frame(self): + """Capture and process a single screen frame using PIL""" + try: + # Capture the screen using PIL + screenshot = PIL.ImageGrab.grab() + logger.debug(f"Screenshot captured - Size: {screenshot.size}") + + # Resize to stay within Gemini's limits + original_size = screenshot.size + screenshot.thumbnail([1024, 1024]) + logger.debug(f"Image resized from {original_size} to {screenshot.size}") + + # Convert to JPEG + image_io = io.BytesIO() + screenshot.save(image_io, format="jpeg", quality=80) + image_io.seek(0) + + # Prepare frame data + mime_type = "image/jpeg" + image_bytes = image_io.read() + encoded_size = len(image_bytes) + logger.debug(f"Image encoded - Size: {encoded_size} bytes") + + return { + "mime_type": mime_type, + "data": base64.b64encode(image_bytes).decode() + } + + except Exception as e: + logger.error(f"Error in _get_screen_frame: {str(e)}") + logger.error(traceback.format_exc()) + return None + + async def get_frames(self): + """Capture frames 
asynchronously""" + try: + logger.info("Starting screen capture...") + frame_count = 0 + + while True: + try: + frame = await asyncio.to_thread(self._get_screen_frame) + if frame is None: + logger.error("Screen capture failed") + await asyncio.sleep(1.0) # Wait before retry + continue + + frame_count += 1 + if frame_count % 10 == 0: # Log every 10th frame + logger.debug(f"Captured screen frame {frame_count}") + + self.video_out_queue.put_nowait(frame) + logger.debug(f"Frame {frame_count} added to queue") + + except Exception as e: + logger.error(f"Error in frame capture loop: {str(e)}") + logger.error(traceback.format_exc()) + + await asyncio.sleep(1.0) # Capture rate: 1 frame per second + + except Exception as e: + logger.error(f"Error in get_frames: {str(e)}") + logger.error(traceback.format_exc()) + + async def send_frames(self): + """Send frames to the Gemini session""" + frame_count = 0 + try: + while True: + frame = await self.video_out_queue.get() + frame_count += 1 + logger.debug(f"Sending frame {frame_count} to session") + + try: + await self.session.send(frame) + logger.debug(f"Frame {frame_count} sent successfully") + except Exception as e: + logger.error(f"Error sending frame {frame_count}: {str(e)}") + except Exception as e: + logger.error(f"Error in send_frames: {str(e)}") + logger.error(traceback.format_exc()) + + # [Previous audio-related methods remain unchanged] + async def listen_audio(self): + logger.info("Starting audio listening...") + try: + pya = pyaudio.PyAudio() + mic_info = pya.get_default_input_device_info() + logger.debug(f"Using microphone: {mic_info['name']}") + + stream = await asyncio.to_thread( + pya.open, + format=FORMAT, + channels=CHANNELS, + rate=SEND_SAMPLE_RATE, + input=True, + input_device_index=mic_info["index"], + frames_per_buffer=CHUNK_SIZE, + ) + logger.info("Audio stream opened successfully") + + while True: + data = await asyncio.to_thread(stream.read, CHUNK_SIZE) + self.audio_out_queue.put_nowait(data) + except 
Exception as e: + logger.error(f"Error in listen_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def send_audio(self): + try: + chunk_count = 0 + while True: + chunk = await self.audio_out_queue.get() + chunk_count += 1 + if chunk_count % 100 == 0: # Log every 100th chunk + logger.debug(f"Sending audio chunk {chunk_count}") + await self.session.send({"data": chunk, "mime_type": "audio/pcm"}) + except Exception as e: + logger.error(f"Error in send_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def receive_audio(self): + try: + while True: + async for response in self.session.receive(): + server_content = response.server_content + if server_content is not None: + model_turn = server_content.model_turn + if model_turn is not None: + parts = model_turn.parts + + for part in parts: + if part.text is not None: + print(part.text, end="") + elif part.inline_data is not None: + self.audio_in_queue.put_nowait(part.inline_data.data) + + server_content.model_turn = None + turn_complete = server_content.turn_complete + if turn_complete: + logger.debug("Turn complete received") + while not self.audio_in_queue.empty(): + self.audio_in_queue.get_nowait() + except Exception as e: + logger.error(f"Error in receive_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def play_audio(self): + try: + logger.info("Starting audio playback...") + pya = pyaudio.PyAudio() + stream = await asyncio.to_thread( + pya.open, format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True + ) + logger.info("Audio playback stream opened successfully") + + while True: + bytestream = await self.audio_in_queue.get() + await asyncio.to_thread(stream.write, bytestream) + except Exception as e: + logger.error(f"Error in play_audio: {str(e)}") + logger.error(traceback.format_exc()) + + async def run(self): + logger.info("Starting AudioLoop.run()") + try: + async with ( + client.aio.live.connect(model=MODEL, config=CONFIG) as session, + 
asyncio.TaskGroup() as tg, + ): + self.session = session + logger.info("Session connected successfully") + + send_text_task = tg.create_task(self.send_text()) + + def cleanup(task): + logger.info("Cleaning up tasks...") + for t in tg._tasks: + t.cancel() + logger.info("Tasks cleanup complete") + + send_text_task.add_done_callback(cleanup) + + # Create all tasks + tasks = [ + tg.create_task(self.listen_audio()), + tg.create_task(self.send_audio()), + tg.create_task(self.get_frames()), + tg.create_task(self.send_frames()), + tg.create_task(self.receive_audio()), + tg.create_task(self.play_audio()) + ] + + def check_error(task): + if task.cancelled(): + logger.debug(f"Task {task.get_name()} was cancelled") + return + + if task.exception() is not None: + e = task.exception() + logger.error(f"Task {task.get_name()} failed with exception:") + logger.error(traceback.format_exception(None, e, e.__traceback__)) + sys.exit(1) + + for task in tg._tasks: + task.add_done_callback(check_error) + + except Exception as e: + logger.error(f"Error in run: {str(e)}") + logger.error(traceback.format_exc()) + +if __name__ == "__main__": + logger = setup_logging() + logger.info("Starting application...") + print("Application started, type 'q' to exit the app.") + try: + main = AudioLoop() + asyncio.run(main.run()) + except KeyboardInterrupt: + logger.info("Application stopped by user") + except Exception as e: + logger.error(f"Application error: {str(e)}") + logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..08cca02 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +annotated-types==0.7.0 +cachetools==5.5.0 +certifi==2024.8.30 +charset-normalizer==3.4.0 +google-auth==2.37.0 +google-genai==0.2.2 +idna==3.10 +numpy==2.2.0 +opencv-python==4.10.0.84 +pillow==10.4.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +PyAudio==0.2.14 +pydantic==2.10.3 +pydantic_core==2.27.1 +python-dotenv==1.0.1 
+requests==2.32.3 +rsa==4.9 +typing_extensions==4.12.2 +urllib3==2.2.3 +websockets==14.1