Procházet zdrojové kódy

feat:添加公历日期转农历日期的udf

zhaohaipeng před 2 měsíci
rodič
revize
e664f7d551
2 změnil soubory, kde provedl 323 přidání a 0 odebrání
  1. 126 0
      script/fish_reference_audio_sync.py
  2. 197 0
      udf/solar_to_lunar.py

+ 126 - 0
script/fish_reference_audio_sync.py

@@ -0,0 +1,126 @@
+import logging
+from io import BytesIO
+
+import pandas as pd
+import requests
+
+logger = logging.getLogger(__name__)
+
+from typing import Dict, Any
+
+
def build_common_header_map() -> Dict[str, str]:
    """Build the HTTP headers shared by requests to the fish.audio API.

    The bearer token is read from the ``FISH_AUDIO_API_KEY`` environment
    variable when it is set and non-empty; otherwise the token that was
    originally embedded in this script is used so existing runs keep working.

    Returns:
        A new dict with the ``Authorization`` and ``Content-Type`` headers.
    """
    import os

    # NOTE(review): this fallback token is committed to source control. It
    # should be rotated and the fallback removed once the environment
    # variable is provisioned wherever this script runs.
    api_key = os.environ.get("FISH_AUDIO_API_KEY") or "0891f3f93a2640428f9988e267aa57e1"
    return {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
+
+
def get_model_info(reference_id: str) -> Dict[str, Any]:
    """Fetch the description of a fish.audio model.

    Args:
        reference_id: Model identifier, appended to the ``/model/`` endpoint.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: The request itself failed.
    """
    api_url = f"https://api.fish.audio/model/{reference_id}"
    response = requests.get(
        api_url,
        headers=build_common_header_map(),
        timeout=(10, 1800),  # (connect timeout, read timeout) in seconds
    )
    # Fail loudly on an error status instead of handing the caller an error
    # payload that would later surface as an unrelated KeyError.
    response.raise_for_status()
    return response.json()
+
+
def add_reference(
        base_url: str,
        reference_id: str,
        audio_url: str,
        text: str,
        timeout: int = 30,
        download_timeout: int = 60,
) -> Dict[str, Any]:
    """Download an audio file and register it as a reference audio.

    The audio is fetched from ``audio_url`` into memory and then posted as a
    multipart form to ``{base_url}/v1/references/add``.

    Args:
        base_url: Base address of the target server, e.g. "http://localhost:8000".
        reference_id: Unique identifier of the reference audio.
        audio_url: URL of the audio file (http/https).
        text: Transcript that corresponds to the audio.
        timeout: Timeout in seconds for the upload request.
        download_timeout: Timeout in seconds for the audio download.

    Returns:
        The JSON object returned by the server.

    Raises:
        requests.RequestException: The download or the upload request failed.
        ValueError: The downloaded content is empty, the server answered with
            a non-200 status, the body is not a JSON object, or the body
            reports ``success`` as false.
    """
    import posixpath
    from urllib.parse import urlparse

    # 1. Download the audio into memory.
    try:
        download = requests.get(audio_url, timeout=download_timeout)
        download.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise requests.RequestException(f"下载音频失败: {e}") from e
    audio_content = download.content
    if not audio_content:
        raise ValueError("从 URL 下载的音频文件为空")

    # 2. Build the upload request.
    url = f"{base_url.rstrip('/')}/v1/references/add"
    data = {
        "id": reference_id,
        "text": text,
    }
    # Use only the path component so a query string or fragment
    # ("a.wav?token=...") does not end up in the uploaded file name.
    file_name = posixpath.basename(urlparse(audio_url).path) or "audio.wav"
    files = {
        "audio": (file_name, audio_content, "audio/wav"),
    }
    headers = {
        "Accept": "application/json",
    }

    # 3. Send the upload request.
    try:
        response = requests.post(url, data=data, files=files, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        raise requests.RequestException(f"上传请求失败: {e}") from e

    # 4. Decode the body defensively: error responses are not always JSON, and
    # a decoding failure must not hide the HTTP status.
    try:
        resp_json = response.json()
    except ValueError:
        resp_json = None
    payload = resp_json if isinstance(resp_json, dict) else {}

    # 5. Validate the response.
    if response.status_code != 200:
        raise ValueError(f"服务端返回错误 (HTTP {response.status_code}): {payload.get('message', '未知错误')}")

    if not isinstance(resp_json, dict):
        raise ValueError("服务端返回了非 JSON 对象响应")

    if not payload.get("success", False):
        raise ValueError(f"业务失败: {payload.get('message', '未知错误')}")

    return resp_json
+
+
+
def _main():
    """Copy every speaker listed in the exported CSV to the local server.

    For each CSV row the transcript of the first sample of the fish.audio
    model is looked up and uploaded, together with the row's audio, through
    ``add_reference``. Speaker ids in the skip tuple are ignored.
    """
    csv_path = "/Users/zhao/Desktop/aigc_admin_prod_ai_model_tts.csv"
    server_url = "http://192.168.245.146:8080/"
    skipped_speakers = ('6e2d9e58b26c424db6d564ea56983f4d',)

    for record in pd.read_csv(csv_path).itertuples():
        speaker_id = record.speaker_id
        source_audio = record.audio_url
        if speaker_id in skipped_speakers:
            continue

        # The transcript comes from the first sample of the remote model.
        transcript = get_model_info(speaker_id)['samples'][0]['text']
        add_reference(
            base_url=server_url,
            reference_id=speaker_id,
            audio_url=source_audio,
            text=transcript,
            timeout=30,
        )


# Run the sync only when the file is executed as a script.
if __name__ == '__main__':
    _main()

+ 197 - 0
udf/solar_to_lunar.py

@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+import datetime
+
+from odps.udf import annotate
+
# Chinese names used when rendering a lunar date.
# Index 0 is an empty placeholder so that LUNAR_MONTH_NAME[n] is the name of
# lunar month n (1-12).
LUNAR_MONTH_NAME = [
    "", "正月", "二月", "三月", "四月", "五月", "六月",
    "七月", "八月", "九月", "十月", "冬月", "腊月"
]

# Index 0 is an empty placeholder so that LUNAR_DAY_NAME[n] is the name of
# day n (1-30) of a lunar month.
LUNAR_DAY_NAME = [
    "", "初一", "初二", "初三", "初四", "初五", "初六", "初七", "初八", "初九", "初十",
    "十一", "十二", "十三", "十四", "十五", "十六", "十七", "十八", "十九", "二十",
    "廿一", "廿二", "廿三", "廿四", "廿五", "廿六", "廿七", "廿八", "廿九", "三十"
]
+
+
def lunar_to_yyyy_mm_dd(ly, lm, ld):
    """Concatenate year, month and day text with no separator."""
    parts = (ly, lm, ld)
    return "%s%s%s" % parts
+
+
def lunar_to_yyyy_mm_dd_2(ly, lm, ld):
    """Join year, month and day text with hyphens."""
    parts = (ly, lm, ld)
    return "%s-%s-%s" % parts
+
+
def lunar_to_yyyy_mm_dd_cn(ly, lm, ld):
    """Render a lunar date in the "yyyy年MM月dd日" layout.

    ``lm`` may be a Chinese month name that already ends with "月"
    (e.g. "正月", "闰二月") or plain digits (e.g. "01", "闰02"). The "月"
    separator is appended only when it is missing, so numeric output reads
    "2024年01月15日" rather than "2024年0115日".
    """
    month_text = lm if isinstance(lm, str) else str(lm)
    if not month_text.endswith("月"):
        month_text += "月"
    return "{ly}年{lm}{ld}日".format(ly=ly, lm=month_text, ld=ld)
+
+
# Maps each supported date-format name to the strptime pattern used to parse
# the input date string.
FMT_STR_MAP = {
    "yyyyMMdd": "%Y%m%d",
    "yyyy-MM-dd": "%Y-%m-%d",
    "yyyy年MM月dd日": "%Y年%m月%d日"
}

# Maps each supported date-format name to the function that renders the lunar
# (year, month, day) text in that layout.
OUTPUT_FUNC_MAP = {
    "yyyyMMdd": lunar_to_yyyy_mm_dd,
    "yyyy-MM-dd": lunar_to_yyyy_mm_dd_2,
    "yyyy年MM月dd日": lunar_to_yyyy_mm_dd_cn
}
+
# Lunar calendar data, one entry per lunar year, indexed by (year - 1900).
# The table holds 150 entries, so it covers lunar years 1900-2049.
# Bit layout, as read by the helper functions in this file:
#   - low 4 bits (0xf): leap month number, 0 when the year has no leap month
#   - bits 0x8000 down to 0x10: months 1 to 12, set = 30 days, clear = 29 days
#   - bit 0x10000: length of the leap month, set = 30 days, clear = 29 days
LUNAR_INFO = [
    0x04bd8, 0x04ae0, 0x0a570, 0x054d5, 0x0d260, 0x0d950, 0x16554,
    0x056a0, 0x09ad0, 0x055d2, 0x04ae0, 0x0a5b6, 0x0a4d0, 0x0d250,
    0x1d255, 0x0b540, 0x0d6a0, 0x0ada2, 0x095b0, 0x14977, 0x04970,
    0x0a4b0, 0x0b4b5, 0x06a50, 0x06d40, 0x1ab54, 0x02b60, 0x09570,
    0x052f2, 0x04970, 0x06566, 0x0d4a0, 0x0ea50, 0x06e95, 0x05ad0,
    0x02b60, 0x186e3, 0x092e0, 0x1c8d7, 0x0c950, 0x0d4a0, 0x1d8a6,
    0x0b550, 0x056a0, 0x1a5b4, 0x025d0, 0x092d0, 0x0d2b2, 0x0a950,
    0x0b557, 0x06ca0, 0x0b550, 0x15355, 0x04da0, 0x0a5b0, 0x14573,
    0x052b0, 0x0a9a8, 0x0e950, 0x06aa0, 0x0aea6, 0x0ab50, 0x04b60,
    0x0aae4, 0x0a570, 0x05260, 0x0f263, 0x0d950, 0x05b57, 0x056a0,
    0x096d0, 0x04dd5, 0x04ad0, 0x0a4d0, 0x0d4d4, 0x0d250, 0x0d558,
    0x0b540, 0x0b5a0, 0x195a6, 0x095b0, 0x049b0, 0x0a974, 0x0a4b0,
    0x0b27a, 0x06a50, 0x06d40, 0x0af46, 0x0ab60, 0x09570, 0x04af5,
    0x04970, 0x064b0, 0x074a3, 0x0ea50, 0x06b58, 0x05ac0, 0x0ab60,
    0x096d5, 0x092e0, 0x0c960, 0x0d954, 0x0d4a0, 0x0da50, 0x07552,
    0x056a0, 0x0abb7, 0x025d0, 0x092d0, 0x0cab5, 0x0a950, 0x0b4a0,
    0x0baa4, 0x0ad50, 0x055d9, 0x04ba0, 0x0a5b0, 0x15176, 0x052b0,
    0x0a930, 0x07954, 0x06aa0, 0x0ad50, 0x05b52, 0x04b60, 0x0a6e6,
    0x0a4e0, 0x0d260, 0x0ea65, 0x0d530, 0x05aa0, 0x076a3, 0x096d0,
    0x04bd7, 0x04ad0, 0x0a4d0, 0x1d0b6, 0x0d250, 0x0d520, 0x0dd45,
    0x0b5a0, 0x056d0, 0x055b2, 0x049b0, 0x0a577, 0x0a4b0, 0x0aa50,
    0x1b255, 0x06d20, 0x0ada0
]
+
+
def _leap_month(y):
    """Return the leap-month field of lunar year ``y`` (0 when there is none)."""
    year_code = LUNAR_INFO[y - 1900]
    # The leap month number is stored in the low four bits of the entry.
    return year_code & 0xf
+
+
def _leap_days(y):
    """Return the length of the leap month of lunar year ``y``, or 0 if none."""
    if not _leap_month(y):
        return 0
    # Bit 0x10000 distinguishes a 30-day leap month from a 29-day one.
    if LUNAR_INFO[y - 1900] & 0x10000:
        return 30
    return 29
+
+
def _month_days(y, m):
    """Return the length (29 or 30) of regular month ``m`` in lunar year ``y``."""
    year_code = LUNAR_INFO[y - 1900]
    # Month 1 is flagged by bit 0x8000, month 2 by 0x4000, and so on.
    month_bit = 0x10000 >> m
    if year_code & month_bit:
        return 30
    return 29
+
+
def _year_days(y):
    """Return the total number of days in lunar year ``y``, leap month included."""
    year_code = LUNAR_INFO[y - 1900]
    # Bits 0x10..0x8000 flag the twelve regular months; each set bit marks a
    # 30-day month, i.e. one day more than the 12 * 29 = 348 baseline.
    long_months = bin((year_code >> 4) & 0xfff).count("1")
    return 348 + long_months + _leap_days(y)
+
+
def solar_to_lunar_core(y, m, d):
    """Convert a Gregorian date to a lunar date.

    The conversion counts days from 1900-01-31 (used as day 1 of month 1 of
    lunar year 1900) and walks the LUNAR_INFO table year by year, then month
    by month.

    Args:
        y: Gregorian year.
        m: Gregorian month (1-12).
        d: Gregorian day of month.

    Returns:
        A tuple ``(lunar_year, lunar_month, lunar_day, is_leap)`` where
        ``is_leap`` is True when the date falls in the leap month.

    Raises:
        ValueError: The date is invalid, is earlier than 1900-01-31, or lies
            beyond the last year covered by LUNAR_INFO.
    """
    first_year = 1900
    last_year = first_year + len(LUNAR_INFO) - 1

    offset = (datetime.date(y, m, d) - datetime.date(1900, 1, 31)).days
    if offset < 0:
        raise ValueError(
            "date {0}-{1}-{2} is before 1900-01-31, the earliest supported date".format(y, m, d)
        )

    # Find the lunar year that contains the date; afterwards ``offset`` is the
    # zero-based day index inside that year.
    for year in range(first_year, last_year + 1):
        days_in_year = _year_days(year)
        if offset < days_in_year:
            break
        offset -= days_in_year
    else:
        raise ValueError(
            "date {0}-{1}-{2} is beyond lunar year {3}, the last year in the table".format(
                y, m, d, last_year
            )
        )

    # Walk the months in calendar order. The leap month, when present, comes
    # directly after the regular month that carries the same number.
    leap = _leap_month(year)
    for month in range(1, 13):
        days_in_month = _month_days(year, month)
        if offset < days_in_month:
            return year, month, offset + 1, False
        offset -= days_in_month

        if month == leap:
            days_in_month = _leap_days(year)
            if offset < days_in_month:
                return year, month, offset + 1, True
            offset -= days_in_month

    # Unreachable while _year_days agrees with the per-month lengths.
    raise ValueError("inconsistent lunar table for year {0}".format(year))
+
+
def _parse_input_date_str(date_str, fmt_str):
    """Parse ``date_str`` according to one of the supported format names.

    Args:
        date_str: The date text to parse.
        fmt_str: A key of FMT_STR_MAP, e.g. "yyyy-MM-dd".

    Returns:
        A tuple ``(year, month, day)`` of integers.

    Raises:
        ValueError: ``fmt_str`` is not a supported format name, or
            ``date_str`` does not match the format.
    """
    py_fmt_str = FMT_STR_MAP.get(fmt_str)
    if py_fmt_str is None:
        # Raise instead of returning an error string: callers unpack the
        # result into three values, so a returned string only produced a
        # misleading "too many values to unpack" error.
        raise ValueError("unsupported format {fmt_str}".format(fmt_str=fmt_str))

    dt = datetime.datetime.strptime(date_str, py_fmt_str)
    return dt.year, dt.month, dt.day
+
+
+def _year_to_cnt(year):
+    # 数字到汉字的映射(使用标准“〇”表示零)
+    digit_map = {
+        '0': '〇',
+        '1': '一',
+        '2': '二',
+        '3': '三',
+        '4': '四',
+        '5': '五',
+        '6': '六',
+        '7': '七',
+        '8': '八',
+        '9': '九'
+    }
+    # 将年份转为字符串,逐位转换并拼接
+    chinese_digits = ''.join(digit_map[ch] for ch in str(year))
+    return chinese_digits
+
+
def _lunar_to_string(lunar_year, lunar_month, lunar_day, is_leap, fmt_str, output):
    """Render a lunar date as text.

    Args:
        lunar_year: Lunar year number.
        lunar_month: Lunar month number (1-12).
        lunar_day: Lunar day number (1-30).
        is_leap: True when the month is the leap month; the month text is
            then prefixed with "闰".
        fmt_str: Format name used to pick the layout from OUTPUT_FUNC_MAP;
            unknown names fall back to the "yyyyMMdd" layout.
        output: "cn" renders Chinese numerals and month/day names; any other
            value renders digits.

    Returns:
        The formatted lunar date string.
    """
    if output == 'cn':
        lunar_year = _year_to_cnt(lunar_year)
        lunar_month = LUNAR_MONTH_NAME[lunar_month]
        lunar_day = LUNAR_DAY_NAME[lunar_day]
    else:
        if "MM" in fmt_str:
            lunar_month = str(lunar_month).zfill(2)
        # Pad the day like the month: without it "yyyyMMdd" yields ambiguous
        # 7-digit strings such as "2024011".
        if "dd" in fmt_str:
            lunar_day = str(lunar_day).zfill(2)

    ly = str(lunar_year)
    lm = str(lunar_month) if not is_leap else "闰" + str(lunar_month)
    ld = str(lunar_day)

    return OUTPUT_FUNC_MAP.get(fmt_str, lunar_to_yyyy_mm_dd)(ly, lm, ld)
+
+
@annotate("*->string")
class solar_to_lunar(object):
    """UDF that converts a Gregorian date string into a lunar date string."""

    def evaluate(self, date_str, fmt_str="yyyy-MM-dd", output=""):
        """Convert ``date_str`` and return the lunar date as text.

        Args:
            date_str: Gregorian date text; an empty or missing value gives None.
            fmt_str: Format name used for both parsing the input and laying
                out the result.
            output: Passed through to the renderer; "cn" selects Chinese
                numerals and names.

        Returns:
            The lunar date string, None for empty input, or a string starting
            with "ERROR: " that describes the exception raised on the way.
        """
        if not date_str:
            return None

        try:
            # Gregorian text -> (year, month, day)
            y, m, d = _parse_input_date_str(date_str, fmt_str)
            # Gregorian date -> lunar date
            lunar_year, lunar_month, lunar_day, leap_flag = solar_to_lunar_core(y, m, d)
            result = _lunar_to_string(
                lunar_year, lunar_month, lunar_day, leap_flag, fmt_str, output
            )
        except Exception as err:
            # Report the failure as the row's value rather than raising.
            error_name = type(err).__name__
            return "ERROR: {type}: {e}".format(type=error_name, e=str(err))
        return result