% hfbzthesis/chapters/process.tex

% !TeX root = ../main.tex

% Font and colour choices for code listings.
\lstdefinestyle{lfonts}{
  % Body text: typewriter face at script size.
  basicstyle=\scriptsize\ttfamily,
  % Token classes, each with its own colour.
  keywordstyle=\color{blue!60!black}\bfseries,
  stringstyle=\color{purple},
  commentstyle=\color{olive}\scshape,
}
% Line numbering: a tiny number to the left of every line, counted from 1.
\lstdefinestyle{lnumbers}{
  numbers=left,
  firstnumber=1,
  stepnumber=1,
  numberstyle=\tiny,
  numbersep=1em,
}
% Line handling: break over-long lines and use a tab size of 2.
\lstdefinestyle{llayout}{
  tabsize=2,
  breaklines=true,
  % Disabled option, kept for reference:
  % columns=spacefixed,
}
% Margins and frame: rules at top and bottom, with the left margin and the
% frame's left extension both set to 20pt.
\lstdefinestyle{lgeometry}{
  frame=tb,
  framesep=\fboxsep,
  framexleftmargin=20pt,
  xleftmargin=20pt,
  xrightmargin=0pt,
}

% Umbrella style: applies the font, numbering, layout and geometry styles
% in that order.
\lstdefinestyle{lgeneral}{
  style=lfonts,
  style=lnumbers,
  style=llayout,
  style=lgeometry,
}

% JavaScript language definition used by the listings in this chapter.
\lstdefinelanguage{JavaScript}{
  % Primary keywords, each listed exactly once.
  keywords={typeof, new, true, false, catch, function, return, null, switch, var, if, in, while, do, else, case, break, import, const, from},
  keywordstyle=\color{blue}\bfseries,
  % Second-order keywords. `import` is deliberately not repeated here so
  % that it is styled like its partner `from` in the primary list.
  ndkeywords={class, export, boolean, throw, implements, this},
  ndkeywordstyle=\color{darkgray}\bfseries,
  identifierstyle=\color{black},
  % JavaScript is case-sensitive: `New` or `If` are ordinary identifiers
  % and must not be highlighted as keywords.
  sensitive=true,
  % Line comments (//) and block comments (/* ... */).
  comment=[l]{//},
  morecomment=[s]{/*}{*/},
  commentstyle=\color{purple}\ttfamily,
  % Single- and double-quoted strings with backslash escapes.
  stringstyle=\color{red}\ttfamily,
  morestring=[b]',
  morestring=[b]"
}


\chapter{成语接龙算法流程}

\section{成语词库}

我们站在巨人的肩膀上，前人已经收集了足够的成语词库。成语词库来源于 \href{https://github.com/pwxcoo/chinese-xinhua}{chinese-xinhua}，此项目以 MIT 协议开源。

以下是其中一个成语的数据结构：

\lstset{language = JavaScript, style = lgeneral}

\begin{lstlisting}
{
  "derivation": "语出《法华经·法师功德品》下至阿鼻地狱。”",
  "example": "但也有少数意志薄弱的……逐步上当，终至堕入～。★《上饶集中营·炼狱杂记》",
  "explanation": "阿鼻梵语的译音，意译为无间”，即痛苦无有间断之意。常用来比喻黑暗的社会和严酷的牢狱。又比喻无法摆脱的极其痛苦的境地。",
  "pinyin": "ā bí dì yù",
  "word": "阿鼻地狱",
  "abbreviation": "abdy",
  "id": 0
},
\end{lstlisting}

经统计，共收录了 30895 个成语，但其中还包括了非四字成语。我们使用如下 JavaScript 代码剔除：

\lstset{language = JavaScript, style = lgeneral}

\begin{lstlisting}
import fs from 'node:fs'
import path from 'node:path'

import idioms from './idiom.json' assert { type: 'json' }

const new_idioms = idioms.filter(value => value.word.length === 4)

console.log(new_idioms.length)

const idioms_json = JSON.stringify(new_idioms)

fs.writeFile('./four_letter_idioms.json', idioms_json, (err) => {
  if (err)
    throw err
})
\end{lstlisting}

\section{后端}

\subsection{索引}
为便于查询，首先分别以每个成语的首拼音（首字的拼音）和末拼音（末字的拼音）为其建立索引。

再采用递推法：成语“一个顶俩”的首拼音为“yi”，故以“yi”为初始值，先找出所有末拼音为“yi”的成语，记为“第一层”；再以这些成语的首拼音为新的查询值，找出末拼音与之相同且尚未标记层级的成语，记为“第二层”；依此类推，直到没有新的成语可标记为止，从而为每个成语添加层级属性。

以下为核心代码：

\lstset{language = Go, style = lgeneral}

\begin{lstlisting}
// initIdiomsDB labels idioms with a Level, working outwards from the
// pinyin "yi": pass 1 labels idioms whose last pinyin is "yi", and each
// later pass labels idioms whose last pinyin matches the first pinyin of
// an idiom labelled in the previous pass. Any error is fatal.
func initIdiomsDB() {
  // Last-pinyin values to look up in the current pass, seeded with "yi".
  frontier := mapset.NewSet[string]()
  frontier.Add("yi")

  // Stop once a pass yields no further pinyin to look up.
  for level := 1; frontier.Cardinality() > 0; level++ {
    next := mapset.NewSet[string]()

    for pinyin := range frontier.Iter() {
      // One write transaction per pinyin; the closure scopes the
      // deferred Abort to that transaction.
      labelTail := func() error {
        txn := IdiomsDB.Txn(true)
        defer txn.Abort()

        it, err := txn.Get("lastPinyin", "pinyin", pinyin)
        if err != nil {
          return err
        }

        for raw := it.Next(); raw != nil; raw = it.Next() {
          entry := raw.(*models.LastPinyin)
          if entry == nil || level <= 0 {
            continue
          }
          if entry.Idiom.Level != 0 {
            continue // already has a Level; keep it
          }
          entry.Idiom.Level = level
          if err := txn.Insert("idioms", entry.Idiom); err != nil {
            return err
          }
          next.Add(utils.GetFirstPinyin(entry.Idiom.Pinyin))
        }
        txn.Commit()
        return nil
      }
      if err := labelTail(); err != nil {
        log.Fatal(err)
      }
    }

    // The first pinyins collected here drive the next pass.
    frontier = next
  }
}
\end{lstlisting}

本算法实际上是递推思想的应用。算法前期对成语索引，为每个成语添加Level属性，以便于之后的正向查询成语接龙。

以下为流程图：

\begin{figure}[H]
  \centering
  \includegraphics[width=12cm]{index.pdf}
  \caption{成语索引（层级标注）流程图}
  \label{fig:index}
\end{figure}

\subsection{遍历查询}

得到这些成语后再分别遍历以每个成语首拼音为末拼音的成语。

以下为核心代码：
\lstset{language = Go, style = lgeneral}

\begin{lstlisting}
// Solitaire builds an idiom chain from the requested idiom towards
// "一个顶俩". At each step it moves to a random idiom that starts with the
// current idiom's last pinyin and has a lower, non-zero Level.
//
// Responses: 400 if the request cannot be bound or the idiom lookup
// fails; 204 if the current idiom has Level <= 1 and does not end in
// "yi"; 200 with the ordered chain otherwise. An error from the
// first-pinyin lookup is returned to the caller as-is.
func Solitaire(c echo.Context) error {
  var r *solitaireRequest
  err := c.Bind(&r)
  if err != nil {
    return c.String(http.StatusBadRequest, "")
  }
  idiom, err := db.GetIdiom(r.Word)
  if err != nil {
    return c.String(http.StatusBadRequest, "")
  }

  idioms := linkedhashset.New()

  for idiom != nil {
    idioms.Add(idiom)
    level := idiom.Level
    if level > 1 {
      pinyinIdioms, err := db.GetFirstPinyinIdioms(utils.GetLastPinyin(idiom.Pinyin))
      if err != nil {
        return err
      }
      // Keep only idioms strictly closer to the target. Level 0 is what
      // initIdiomsDB leaves on idioms it never labelled, so those are
      // dead ends and must be excluded, not treated as "lower".
      filtered := utils.Filter(pinyinIdioms, func(i *models.Idiom) bool {
        return i.Level > 0 && i.Level < idiom.Level
      })
      idiom = utils.RandomItem(filtered) // several may qualify; pick one at random
    } else if utils.GetLastPinyin(idiom.Pinyin) == "yi" {
      idioms.Add(&models.Idiom{ // chain complete: append the target idiom
        Data: models.Data{
          Id:           29502,
          Derivation:   "无",
          Example:      "董同学在项目中起到了~的效果。",
          Explanation:  "形容一个人做事有效果，一个人能代替两个人。",
          Pinyin:       "yī gè dǐng liǎ",
          Word:         "一个顶俩",
          Abbreviation: "无",
        },
      })
      break
    } else {
      return c.String(http.StatusNoContent, "") // no chain from this idiom
    }
  }

  return c.JSON(http.StatusOK, idioms) // success: emit the chain
}
\end{lstlisting}

经过测试，循环遍历5次后需要遍历的成语列表为空，即最多4个成语就能到达“一个顶俩”这一成语。

每个成语的 Level 属性表示该成语接龙到成语“一个顶俩”的“距离”。

在得到目标成语后，只需查询首拼音与该成语末拼音相同的成语中比该成语层级属性更低的成语，就能保证查询的方向是在步步逼近成语“一个顶俩”。