kefu/knowledge/utils/tools.go

159 lines
3.4 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package utils
import (
"baliance.com/gooxml/document"
"bytes"
"compress/gzip"
"context"
"github.com/google/go-tika/tika"
"github.com/ledongthuc/pdf"
"github.com/tealeg/xlsx"
"io"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
)
// ReadPdfAll2 extracts the text of the file at path by sending it to a Tika
// server at http://127.0.0.1:9998 and stripping the markup from the response
// with TrimHtml.
//
// It returns an empty string and the underlying error when the file cannot be
// opened or when the Tika request fails.
func ReadPdfAll2(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	// Only schedule Close once the open is known to have succeeded.
	defer f.Close()

	client := tika.NewClient(nil, "http://127.0.0.1:9998")
	res, err := client.Parse(context.TODO(), f)
	if err != nil {
		// Do not hand back a partially processed body together with an error.
		return "", err
	}
	return TrimHtml(res), nil
}
// ReadExcelAll opens the workbook at excelPath and returns one string per row,
// built by concatenating that row's cell values with no separator. Rows whose
// concatenation is empty are skipped. Sheets and rows are visited in the order
// the workbook exposes them.
//
// On success the returned slice is never nil; if the workbook cannot be
// opened, it returns nil and the open error.
func ReadExcelAll(excelPath string) ([]string, error) {
	workbook, err := xlsx.OpenFile(excelPath)
	if err != nil {
		return nil, err
	}

	lines := []string{}
	for _, sheet := range workbook.Sheets {
		for _, row := range sheet.Rows {
			// Glue every cell value of the row into a single string.
			var joined string
			for _, cell := range row.Cells {
				joined += cell.Value
			}
			// Keep only rows that actually produced some text.
			if joined != "" {
				lines = append(lines, joined)
			}
		}
	}
	return lines, nil
}
// ReadDocxAll opens the document at fileName and returns the text of all runs
// of all paragraphs, concatenated in document order with no separator between
// runs or paragraphs.
//
// It returns an empty string and the open error when the document cannot be
// opened.
func ReadDocxAll(fileName string) (string, error) {
	doc, err := document.Open(fileName)
	if err != nil {
		return "", err
	}

	// A Builder avoids re-copying the accumulated text on every append, which
	// repeated string concatenation would do for large documents.
	var text strings.Builder
	for _, para := range doc.Paragraphs() {
		// A run is a fragment of a paragraph whose text shares the same formatting.
		for _, run := range para.Runs() {
			text.WriteString(run.Text())
		}
	}
	return text.String(), nil
}
// ReadPdfAll returns the text content of the PDF at path.
//
// It first tries to extract plain text with the pdf package and passes the
// result through TrimHtml. If opening the file, extracting the text, or
// draining the extracted text fails, it falls back to ReadPdfAll2 and returns
// whatever that yields.
func ReadPdfAll(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// Release whatever handle Open handed back, even alongside an error; the
	// nil guard keeps the deferred Close from running when no file was opened.
	if f != nil {
		defer f.Close()
	}
	if err != nil {
		return ReadPdfAll2(path)
	}

	plain, err := r.GetPlainText()
	if err != nil {
		return ReadPdfAll2(path)
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(plain); err != nil {
		// Treat a failed read like the other local-extraction failures rather
		// than silently returning truncated text.
		return ReadPdfAll2(path)
	}
	return TrimHtml(buf.String()), nil
}
// HttpGet performs an HTTP GET on url and returns the response body as a
// string. When the response carries a "Content-Encoding: gzip" header the body
// is decompressed before being returned.
//
// Any failure (request error, invalid gzip stream, read error) is logged and
// reported to the caller as an empty string. The HTTP status code is not
// inspected.
func HttpGet(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Println(err)
		return ""
	}
	// Close the body on every exit path, including the early error returns below.
	defer res.Body.Close()

	var reader io.Reader = res.Body
	if res.Header.Get("Content-Encoding") == "gzip" {
		gz, err := gzip.NewReader(res.Body)
		if err != nil {
			log.Println(err)
			return ""
		}
		defer gz.Close()
		reader = gz
	}

	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Println(err)
		return ""
	}
	return string(body)
}
// Patterns used by TrimHtml. They are compiled once at package initialisation
// instead of on every call; MustCompile also surfaces an invalid pattern
// immediately rather than leaving a nil *Regexp to be dereferenced later.
var (
	trimHtmlTagPattern    = regexp.MustCompile(`\<[\S\s]+?\>`)
	trimHtmlStylePattern  = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	trimHtmlScriptPattern = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	trimHtmlSpacePattern  = regexp.MustCompile(`\s{2,}`)
)

// TrimHtml strips HTML markup from src and returns the remaining text.
//
// The steps are applied in this order:
//  1. every "<...>" span is lower-cased so the following patterns match
//     regardless of the original tag casing;
//  2. "<style ...</style>" blocks are removed together with their content;
//  3. "<script ...</script>" blocks are removed together with their content;
//  4. every remaining "<...>" span is removed;
//  5. every run of two or more whitespace characters is removed entirely
//     (single whitespace characters are kept);
//  6. leading and trailing whitespace is trimmed.
func TrimHtml(src string) string {
	// Lower-case the tags only; text outside "<...>" keeps its casing.
	src = trimHtmlTagPattern.ReplaceAllStringFunc(src, strings.ToLower)
	// Drop STYLE blocks.
	src = trimHtmlStylePattern.ReplaceAllString(src, "")
	// Drop SCRIPT blocks.
	src = trimHtmlScriptPattern.ReplaceAllString(src, "")
	// Drop whatever tags are left.
	src = trimHtmlTagPattern.ReplaceAllString(src, "")
	// Drop runs of consecutive whitespace (including blank lines).
	src = trimHtmlSpacePattern.ReplaceAllString(src, "")
	return strings.TrimSpace(src)
}
// RemoveRepByMap returns the elements of slc with duplicates dropped, keeping
// the first occurrence of each value in its original position order. Map key
// uniqueness is what detects the repeats. The result is always a non-nil
// slice, even for empty or nil input.
func RemoveRepByMap(slc []string) []string {
	kept := []string{}
	seen := map[string]struct{}{} // values already emitted
	for _, item := range slc {
		if _, dup := seen[item]; dup {
			continue
		}
		seen[item] = struct{}{}
		kept = append(kept, item)
	}
	return kept
}