jsk-ros-pkg · iory · Jun 11, 2022 · Jun 11, 2022 · Jun 11, 2022 · Jun 11, 2022
diff --git a/respeaker_ros/CMakeLists.txt b/respeaker_ros/CMakeLists.txt
@@ -18,5 +18,7 @@ catkin_install_python(PROGRAMS ${PYTHON_SCRIPTS}
 
 if(CATKIN_ENABLE_TESTING)
   find_package(rostest REQUIRED)
+  find_package(roslaunch REQUIRED)
   add_rostest(test/sample_respeaker.test)
+  roslaunch_add_file_check(launch/sample_respeaker.launch)
 endif()
diff --git a/respeaker_ros/README.md b/respeaker_ros/README.md
@@ -87,6 +87,151 @@ A ROS Package for Respeaker Mic Array
     a: 0.3"
     ```
 
+## Parameters for respeaker_node.py
+
+  - ### Publishing topics
+
+    - `audio` (`audio_common_msgs/AudioData`)
+
+      Processed audio for ASR. 1 channel.
+
+    - `audio_info` (`audio_common_msgs/AudioInfo`)
+
+      Audio info with respect to `~audio`.
+
+    - `audio_raw` (`audio_common_msgs/AudioData`)
+
+      Raw microphone array audio data (4 channels). You may need to update the ReSpeaker firmware to obtain it.
+
+      If the firmware isn't supported, this will not be output.
+
+    - `audio_info_raw` (`audio_common_msgs/AudioInfo`)
+
+      Audio info with respect to `~audio_raw`.
+
+      If the firmware isn't supported, this will not be output.
+
+    - `speech_audio` (`audio_common_msgs/AudioData`)
+
+      Audio data while a person is speaking using the VAD function.
+
+    - `speech_audio_raw` (`audio_common_msgs/AudioData`)
+
+      4-channel audio data captured while a person is speaking, as detected by the VAD function.
+
+      If the firmware isn't supported, this will not be output.
+
+    - `audio_merged_playback` (`audio_common_msgs/AudioData`)
+
+      Data that combines the sound of mic and speaker.
+
+      If the firmware isn't supported, this will not be output.
+
+      For more detail, please see https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
+
+    - `~is_speeching` (`std_msgs/Bool`)
+
+      Using VAD function, publish whether someone is speaking.
+
+    - `~sound_direction` (`std_msgs/Int32`)
+
+      Direction of sound.
+
+    - `~sound_localization` (`geometry_msgs/PoseStamped`)
+
+      Localized sound direction. The published position lies in the estimated direction, at a radius of `~doa_xy_offset`.
+
+  - ### Parameters
+
+    - `~update_rate` (`Double`, default: `10.0`)
+
+      Rate (in Hz) at which info data such as `~is_speeching`, `~sound_direction`, `~sound_localization`, `~speech_audio` and `~speech_audio_raw` are published.
+
+    - `~sensor_frame_id` (`String`, default: `respeaker_base`)
+
+      Frame id.
+
+    - `~doa_xy_offset` (`Double`, default: `0.0`)
+
+      Radius at which the estimated sound position is placed along the estimated sound direction.
+
+    - `~doa_yaw_offset` (`Double`, default: `90.0`)
+
+      Estimated DoA angle offset.
+
+    - `~speech_prefetch` (`Double`, default: `0.5`)
+
+      Time to represent how long speech is pre-stored in buffer.
+
+    - `~speech_continuation` (`Double`, default: `0.5`)
+
+      If the time between the current time and the time when the speech is stopped is shorter than this time,
+      it is assumed that someone is speaking.
+
+    - `~speech_max_duration` (`Double`, default: `7.0`)
+
+    - `~speech_min_duration` (`Double`, default: `0.1`)
+
+      If the duration of the detected speech is between `~speech_min_duration` and `~speech_max_duration`, `~speech_audio` and `~speech_audio_raw` will be published.
+
+    - `~suppress_pyaudio_error` (`Bool`, default: `True`)
+
+      If this value is `True`, suppress error from pyaudio.
+
+## Parameters for speech_to_text.py
+
+  - ### Publishing topics
+
+    - `~speech_to_text` (`speech_recognition_msgs/SpeechRecognitionCandidates`)
+
+      Recognized text.
+
+  - ### Subscribing topics
+
+    - `audio` (`audio_common_msgs/AudioData`)
+
+      Input audio.
+
+  - ### Parameters
+
+    - `~audio_info` (`String`, default: ``)
+
+      audio_info (`audio_common_msgs/AudioInfo`) topic. If this value is specified, `~sample_rate`, `~sample_width` and `~channels` parameters are obtained from the topic.
+
+    - `~sample_rate` (`Int`, default: `16000`)
+
+      Sampling rate.
+
+    - `~sample_width` (`Int`, default: `2`)
+
+      Sample width.
+
+    - `~channels` (`Int`, default: `1`)
+
+      Number of channels.
+
+    - `~target_channel` (`Int`, default: `0`)
+
+      Index of the target channel.
+
+    - `~language` (`String`, default: `ja-JP`)
+
+      Language of the speech-to-text service. For English users, you can specify `en-US`.
+
+    - `~self_cancellation` (`Bool`, default: `True`)
+
+      Ignore voice input while the robot is speaking.
+
+    - `~tts_tolerance` (`Double`, default: `1.0`)
+
+      Time during which the robot is still assumed to be SPEAKING after the TTS service has finished.
+
+    - `~tts_action_names` (`List[String]`, default: `['sound_play']`)
+
+      If `~self_cancellation` is `True`, this value will be used.
+
+      While any of these actions is active, the callback subscribed to `audio` does nothing.
+
 ## Use cases
 
 ### Voice Recognition

diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch
@@ -13,14 +13,17 @@
   <arg name="language" default="en-US"/>
   <!-- self cancellation -->
   <arg name="self_cancellation" default="true"/>
+  <!-- audio info topic name -->
+  <arg name="audio_info" default="audio_info"/>
 
   <node if="$(arg publish_tf)"
         name="static_transformer" pkg="tf" type="static_transform_publisher"
         args="0 0 0 0 0 0 1 map respeaker_base 100"/>
 
   <node if="$(arg launch_respeaker)"
         name="respeaker_node" pkg="respeaker_ros" type="respeaker_node.py"
-        respawn="true" respawn_delay="10" />
+        respawn="true" respawn_delay="10" >
+  </node>
 
   <node if="$(arg launch_soundplay)"
         name="sound_play" pkg="sound_play" type="soundplay_node.py"/>
@@ -30,6 +33,7 @@
     <remap from="audio" to="$(arg audio)"/>
     <remap from="speech_to_text" to="$(arg speech_to_text)"/>
     <rosparam subst_value="true">
+      audio_info: $(arg audio_info)
       language: $(arg language)
       self_cancellation: $(arg self_cancellation)
       tts_tolerance: 0.5

diff --git a/respeaker_ros/package.xml b/respeaker_ros/package.xml
@@ -15,6 +15,7 @@
   <exec_depend>flac</exec_depend>
   <exec_depend>geometry_msgs</exec_depend>
   <exec_depend>std_msgs</exec_depend>
+  <exec_depend>sound_play</exec_depend>
   <exec_depend>speech_recognition_msgs</exec_depend>
   <exec_depend>tf</exec_depend>
   <exec_depend condition="$ROS_PYTHON_VERSION == 2">python-numpy</exec_depend>

diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py
@@ -16,6 +16,7 @@
 import sys
 import time
 from audio_common_msgs.msg import AudioData
+from audio_common_msgs.msg import AudioInfo
 from geometry_msgs.msg import PoseStamped
 from std_msgs.msg import Bool, Int32, ColorRGBA
 from dynamic_reconfigure.server import Server
@@ -254,7 +255,6 @@ def __init__(self, on_audio, channel=0, suppress_error=True):
         if self.channels != 6:
             rospy.logwarn("%d channel is found for respeaker" % self.channels)
             rospy.logwarn("You may have to update firmware.")
-        self.channel = min(self.channels - 1, max(0, self.channel))
 
         self.stream = self.pyaudio.open(
             input=True, start=False,
@@ -284,9 +284,8 @@ def stream_callback(self, in_data, frame_count, time_info, status):
         data = np.frombuffer(in_data, dtype=np.int16)
         chunk_per_channel = int(len(data) / self.channels)
         data = np.reshape(data, (chunk_per_channel, self.channels))
-        chan_data = data[:, self.channel]
         # invoke callback
-        self.on_audio(chan_data.tobytes())
+        self.on_audio(data)
         return None, pyaudio.paContinue
 
     def start(self):
@@ -322,21 +321,80 @@ def __init__(self):
         self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True)
         self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True)
         self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10)
+        self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo,
+                                              queue_size=1, latch=True)
+        self.pub_audio_raw_info = rospy.Publisher("audio_info_raw", AudioInfo,
+                                                  queue_size=1, latch=True)
         self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10)
         # init config
         self.config = None
         self.dyn_srv = Server(RespeakerConfig, self.on_config)
         # start
         self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
+        self.n_channel = self.respeaker_audio.channels
+
         self.speech_prefetch_bytes = int(
-            self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
+            1
+            * self.speech_prefetch
+            * self.respeaker_audio.rate
+            * self.respeaker_audio.bitdepth / 8.0)
         self.speech_prefetch_buffer = b""
         self.respeaker_audio.start()
         self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
                                       self.on_timer)
         self.timer_led = None
         self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)
 
+        # processed audio for ASR
+        info_msg = AudioInfo(
+            channels=1,
+            sample_rate=self.respeaker_audio.rate,
+            sample_format='S16LE',
+            bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth,
+            coding_format='WAVE')
+        self.pub_audio_info.publish(info_msg)
+
+        if self.n_channel > 1:
+            # The respeaker has 4 microphones.
+            # Multiple microphones can be used for
+            # beam forming (strengthening the sound in a specific direction)
+            # and sound localization (the respeaker outputs the azimuth
+            # direction, but the multichannel can estimate
+            # the elevation direction). etc.
+
+            # Channel 0: processed audio for ASR
+            # Channel 1: mic1 raw data
+            # Channel 2: mic2 raw data
+            # Channel 3: mic3 raw data
+            # Channel 4: mic4 raw data
+            # Channel 5: merged playback
+            # For more detail, please see
+            # https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
+            # (self.n_channel - 2) = 4 channels are multiple microphones.
+            self.pub_audio_raw = rospy.Publisher("audio_raw", AudioData,
+                                                 queue_size=10)
+            self.pub_audio_merged_playback = rospy.Publisher(
+                "audio_merged_playback", AudioData,
+                queue_size=10)
+            info_raw_msg = AudioInfo(
+                channels=self.n_channel - 2,
+                sample_rate=self.respeaker_audio.rate,
+                sample_format='S16LE',
+                bitrate=(self.respeaker_audio.rate *
+                         self.respeaker_audio.bitdepth),
+                coding_format='WAVE')
+            self.pub_audio_raw_info.publish(info_raw_msg)
+
+            self.speech_audio_raw_buffer = b""
+            self.speech_raw_prefetch_buffer = b""
+            self.pub_speech_audio_raw = rospy.Publisher(
+                "speech_audio_raw", AudioData, queue_size=10)
+            # Prefetch size in bytes for the (n_channel - 2) raw mic channels.
+            self.speech_raw_prefetch_bytes = int(
+                (self.n_channel - 2) * self.speech_prefetch
+                * self.respeaker_audio.rate
+                * self.respeaker_audio.bitdepth / 8.0)
+
     def on_shutdown(self):
         try:
             self.respeaker.close()
@@ -374,14 +432,30 @@ def on_status_led(self, msg):
                                        oneshot=True)
 
     def on_audio(self, data):
-        self.pub_audio.publish(AudioData(data=data))
+        # take processed audio for ASR.
+        processed_data = data[:, 0].tobytes()
+        self.pub_audio.publish(AudioData(data=processed_data))
+        if self.n_channel > 1:
+            raw_audio_data = data[:, 1:5].reshape(-1).tobytes()
+            self.pub_audio_raw.publish(
+                AudioData(data=raw_audio_data))
+            self.pub_audio_merged_playback.publish(
+                AudioData(data=data[:, 5].tobytes()))
         if self.is_speeching:
             if len(self.speech_audio_buffer) == 0:
                 self.speech_audio_buffer = self.speech_prefetch_buffer
-            self.speech_audio_buffer += data
+                if self.n_channel > 1:
+                    self.speech_audio_raw_buffer = self.speech_raw_prefetch_buffer
+            self.speech_audio_buffer += processed_data
+            if self.n_channel > 1:
+                self.speech_audio_raw_buffer += raw_audio_data
         else:
-            self.speech_prefetch_buffer += data
+            self.speech_prefetch_buffer += processed_data
             self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]
+            if self.n_channel > 1:
+                self.speech_raw_prefetch_buffer += raw_audio_data
+                self.speech_raw_prefetch_buffer = self.speech_raw_prefetch_buffer[
+                    -self.speech_raw_prefetch_bytes:]
 
     def on_timer(self, event):
         stamp = event.current_real or rospy.Time.now()
@@ -421,13 +495,18 @@ def on_timer(self, event):
         elif self.is_speeching:
             buf = self.speech_audio_buffer
             self.speech_audio_buffer = b""
+            # The raw buffer and its publisher are only created when n_channel > 1.
+            if self.n_channel > 1:
+                buf_raw = self.speech_audio_raw_buffer
+                self.speech_audio_raw_buffer = b""
             self.is_speeching = False
             duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
-            duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
+            duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth / self.n_channel
             rospy.loginfo("Speech detected for %.3f seconds" % duration)
             if self.speech_min_duration <= duration < self.speech_max_duration:
-
                 self.pub_speech_audio.publish(AudioData(data=buf))
+                if self.n_channel > 1:
+                    self.pub_speech_audio_raw.publish(AudioData(data=buf_raw))
 
 
 if __name__ == '__main__':