kefu/knowledge/utils/tools.go

159 lines
3.4 KiB
Go
Raw Normal View History

2024-12-10 02:50:12 +00:00
package utils
import (
"baliance.com/gooxml/document"
"bytes"
"compress/gzip"
"context"
"github.com/google/go-tika/tika"
"github.com/ledongthuc/pdf"
"github.com/tealeg/xlsx"
"io"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
)
// ReadPdfAll2 extracts the text of the file at path by sending it to the
// Tika server at http://127.0.0.1:9998 and stripping markup from the
// response with TrimHtml.
//
// It returns an empty string and the error when the file cannot be opened
// or the Tika parse call fails.
func ReadPdfAll2(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	// Register the close only once the open has succeeded and f is valid.
	defer f.Close()

	client := tika.NewClient(nil, "http://127.0.0.1:9998")
	res, err := client.Parse(context.TODO(), f)
	if err != nil {
		// Do not hand back a processed result together with an error.
		return "", err
	}
	return TrimHtml(res), nil
}
// ReadExcelAll opens the Excel workbook at excelPath and returns one string
// per row across all sheets. Each string is the concatenation of the row's
// cell values with no separator; rows whose concatenation is empty are
// skipped. The returned slice is non-nil even when no rows are collected.
func ReadExcelAll(excelPath string) ([]string, error) {
	workbook, err := xlsx.OpenFile(excelPath)
	if err != nil {
		return nil, err
	}

	rows := make([]string, 0)
	for _, sheet := range workbook.Sheets {
		for _, row := range sheet.Rows {
			// Join the values of every cell in this row.
			var joined strings.Builder
			for _, cell := range row.Cells {
				joined.WriteString(cell.Value)
			}
			if joined.Len() > 0 {
				rows = append(rows, joined.String())
			}
		}
	}
	return rows, nil
}
// ReadDocxAll opens the .docx document at fileName and returns the text of
// every run of every paragraph reported by doc.Paragraphs(), concatenated in
// order with no separator between runs or paragraphs.
//
// It returns an empty string and the error when the document cannot be
// opened.
func ReadDocxAll(fileName string) (string, error) {
	doc, err := document.Open(fileName)
	if err != nil {
		return "", err
	}

	// A Builder avoids re-copying the accumulated text on every append,
	// which repeated string concatenation would do for large documents.
	var text strings.Builder
	for _, para := range doc.Paragraphs() {
		// A run is a fragment of the paragraph whose text shares one format.
		for _, run := range para.Runs() {
			text.WriteString(run.Text())
		}
	}
	return text.String(), nil
}
// ReadPdfAll returns the plain text content of the PDF file at path.
//
// The file is first parsed in-process with the pdf package. If opening the
// file, extracting its plain text, or reading that text fails, the function
// falls back to ReadPdfAll2 and returns whatever that call returns.
// On the in-process path the extracted text is passed through TrimHtml.
func ReadPdfAll(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// Close whatever file handle was returned, including one returned
	// alongside an error, but never call Close on a nil handle.
	if f != nil {
		defer f.Close()
	}
	if err != nil {
		return ReadPdfAll2(path)
	}

	plain, err := r.GetPlainText()
	if err != nil {
		return ReadPdfAll2(path)
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(plain); err != nil {
		// A failed read would otherwise yield silently truncated text.
		return ReadPdfAll2(path)
	}
	return TrimHtml(buf.String()), nil
}
// HttpGet performs an HTTP GET on url and returns the response body as a
// string. When the response carries a "Content-Encoding: gzip" header the
// body is decompressed before being returned.
//
// Any failure (request error, invalid gzip stream, read error) is logged and
// reported to the caller as an empty string. The HTTP status code is not
// inspected.
func HttpGet(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Println(err)
		return ""
	}
	// Deferred so the body is released on every return path below.
	defer res.Body.Close()

	var reader io.Reader = res.Body
	if res.Header.Get("Content-Encoding") == "gzip" {
		zr, err := gzip.NewReader(res.Body)
		if err != nil {
			log.Println(err)
			return ""
		}
		defer zr.Close()
		reader = zr
	}

	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Println(err)
		return ""
	}
	return string(body)
}
// Patterns used by TrimHtml, compiled once at package initialisation instead
// of on every call.
var (
	// trimHTMLTagRe matches anything enclosed in angle brackets.
	trimHTMLTagRe = regexp.MustCompile(`\<[\S\s]+?\>`)
	// trimHTMLStyleRe matches a lower-cased <style> element and its content.
	trimHTMLStyleRe = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	// trimHTMLScriptRe matches a lower-cased <script> element and its content.
	trimHTMLScriptRe = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	// trimHTMLSpaceRe matches a run of two or more whitespace characters.
	trimHTMLSpaceRe = regexp.MustCompile(`\s{2,}`)
)

// TrimHtml strips HTML markup from src and returns the remaining text.
//
// Processing order:
//  1. every angle-bracketed tag is lower-cased so the next two steps match
//     regardless of the original letter case;
//  2. <style>...</style> and <script>...</script> elements are removed
//     together with their content;
//  3. all remaining tags are removed;
//  4. every run of two or more whitespace characters is removed entirely
//     (not collapsed to a single space), while single whitespace characters
//     are kept;
//  5. leading and trailing whitespace is trimmed.
func TrimHtml(src string) string {
	src = trimHTMLTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	src = trimHTMLStyleRe.ReplaceAllString(src, "")
	src = trimHTMLScriptRe.ReplaceAllString(src, "")
	src = trimHTMLTagRe.ReplaceAllString(src, "")
	src = trimHTMLSpaceRe.ReplaceAllString(src, "")
	return strings.TrimSpace(src)
}
// RemoveRepByMap returns the elements of slc with duplicates dropped,
// keeping the first occurrence of each value in its original order.
// A map is used as the set of values already emitted. The result is a
// non-nil slice even when slc is empty or nil.
func RemoveRepByMap(slc []string) []string {
	unique := []string{}
	seen := make(map[string]struct{})
	for _, item := range slc {
		if _, dup := seen[item]; dup {
			continue
		}
		seen[item] = struct{}{}
		unique = append(unique, item)
	}
	return unique
}