/**
 * Strips presentational noise from an HTML string.
 *
 * Every `<img>` tag and every run of `<br>` tags (plus the whitespace that
 * follows each `<br>`) is removed outright. `<span>` and `<p>` elements are
 * removed when they contain nothing but whitespace; this is repeated until the
 * string stops changing, so wrappers that only become empty after their
 * children were stripped (e.g. `<p><br></p>` or `<span><span></span></span>`)
 * are removed as well. Tag names are matched case-insensitively.
 *
 * @param htmlString Raw html string
 * @returns A HTML string that has empty tags removed.
 */
const cleanseHtml = (htmlString: string): string => {
  const imgPattern = /<img[^>]*\/?>/gi
  const brPattern = /(<br\s*\/?>\s*)+/gi
  const emptySpanPattern = /<span[^>]*>\s*<\/span>/gi
  const emptyParagraphPattern = /<p[^>]*>\s*<\/p>/gi

  // Void elements go first: removing them can leave their parent empty, and
  // the pass below must be able to see that.
  let cleaned = htmlString.replace(imgPattern, '').replace(brPattern, '')

  // Removing an empty element may empty its parent, so iterate to a fixed
  // point. Each successful replacement shortens the string, so this terminates.
  let previous: string
  do {
    previous = cleaned
    cleaned = cleaned.replace(emptySpanPattern, '').replace(emptyParagraphPattern, '')
  } while (cleaned !== previous)

  return cleaned
}

/**
 * Extracts `<html>…</html>` fragments from a content string, paired with the
 * document title that precedes each fragment when the string is sectioned.
 *
 * The input is passed through `cleanseHtml` and then split on the literal
 * marker `document: `.
 *
 * - If the marker is present, every segment after the first is expected to look
 *   like `<title>, html content: <html>…</html>`. Segments lacking either the
 *   title delimiter or an `<html>` fragment are skipped. Anything before the
 *   first marker is ignored.
 * - If the marker is absent, the first `<html>…</html>` fragment in the whole
 *   string (if any) is returned without a `document` field.
 *
 * @param docContentString Raw content string, optionally sectioned by `document: ` markers.
 * @returns One entry per recognised section (`document` + `text`), or at most
 *   one `text`-only entry for unsectioned input; an empty array when nothing matches.
 */
export const documentsMatchInString = (
  docContentString: string
): Array<{ document: string; text: string }> | Array<{ text: string }> => {
  const htmlPattern = /<html>[\s\S]*?<\/html>/
  // Anchored at the start of the segment and using [\s\S] instead of `.` so a
  // title that spans several lines is captured whole; `.` cannot cross a line
  // terminator and would yield only the title's last line.
  const titlePattern = /^([\s\S]*?)(?=,\shtml content:)/

  // split() always yields at least one element; the default only satisfies the type checker.
  const [leadingSegment = '', ...documentSegments] = cleanseHtml(docContentString).split('document: ')

  // Unsectioned html: no "document: " marker was found.
  if (documentSegments.length === 0) {
    const htmlMatch = htmlPattern.exec(leadingSegment.trim())
    return htmlMatch ? [{ text: htmlMatch[0] }] : []
  }

  // Sectioned html: one candidate entry per segment following a marker.
  const documents: Array<{ document: string; text: string }> = []
  for (const rawSegment of documentSegments) {
    const segment = rawSegment.trim()
    const titleMatch = titlePattern.exec(segment)
    const htmlMatch = htmlPattern.exec(segment)
    if (!titleMatch || !htmlMatch) continue

    // Normalise the title: drop surrounding quotes, flatten line breaks and
    // remove a trailing period.
    const title = (titleMatch[1] ?? '')
      .trim()
      .replace(/^"/, '')
      .replace(/"$/, '')
      .replace(/\r?\n/g, ' ')
      .replace(/\.$/, '')

    documents.push({ document: title, text: htmlMatch[0] })
  }

  return documents
}
