kefu/tools/doc.go

91 lines
1.7 KiB
Go
Raw Permalink Normal View History

2024-12-10 02:50:12 +00:00
package tools
import (
"baliance.com/gooxml/document"
"bytes"
"context"
"github.com/google/go-tika/tika"
"github.com/ledongthuc/pdf"
"github.com/tealeg/xlsx"
"os"
)
// ReadPdfAll2 extracts the text of the file at path by sending it to the
// Tika server listening on http://127.0.0.1:9998.
//
// The parsed response is passed through TrimHtml before being returned. If
// the file cannot be opened or the Tika request fails, it returns an empty
// string together with the error.
func ReadPdfAll2(path string) (string, error) {
	const tikaServerURL = "http://127.0.0.1:9998"

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	// Register the close only after the open succeeded, so it never runs on a
	// nil file handle.
	defer f.Close()

	client := tika.NewClient(nil, tikaServerURL)
	res, err := client.Parse(context.TODO(), f)
	if err != nil {
		// Do not hand back a partially meaningful result alongside an error.
		return "", err
	}
	return TrimHtml(res), nil
}
// ReadExcelAll reads every row of every sheet in the Excel file at excelPath.
//
// Each row becomes one string made of its cell values concatenated in order,
// with no separator. Rows whose concatenated value is empty are skipped. The
// returned slice is empty (not nil) when no row produced any text. An error
// is returned only when the file cannot be opened.
func ReadExcelAll(excelPath string) ([]string, error) {
	workbook, err := xlsx.OpenFile(excelPath)
	if err != nil {
		return nil, err
	}

	rows := []string{}
	for _, sh := range workbook.Sheets {
		for _, r := range sh.Rows {
			// Glue the cell values of this row together.
			var joined string
			for _, c := range r.Cells {
				joined += c.Value
			}
			// Only rows that actually contain text are kept.
			if joined != "" {
				rows = append(rows, joined)
			}
		}
	}
	return rows, nil
}
// ReadDocxAll returns the text of the .docx document at fileName.
//
// The text of every run of every paragraph returned by doc.Paragraphs() is
// concatenated in document order; no separator is inserted between runs or
// paragraphs. An error is returned only when the document cannot be opened.
func ReadDocxAll(fileName string) (string, error) {
	doc, err := document.Open(fileName)
	if err != nil {
		return "", err
	}

	// Accumulate into a buffer instead of using += on a string, which would
	// re-copy the whole text for every run.
	var text bytes.Buffer
	for _, para := range doc.Paragraphs() {
		// A run is a fragment of a paragraph whose text shares the same formatting.
		for _, run := range para.Runs() {
			text.WriteString(run.Text())
		}
	}
	return text.String(), nil
}
// ReadPdfAll extracts the text content of the PDF file at path.
//
// It first tries the local pdf package. If the file cannot be opened by that
// package, its plain text cannot be extracted, or the extracted text cannot be
// read, it falls back to ReadPdfAll2 and returns that function's result and
// error unchanged. Locally extracted text is passed through TrimHtml before
// being returned.
func ReadPdfAll(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// Close whatever file handle was returned, including on the error path,
	// but never call Close on a nil handle.
	if f != nil {
		defer f.Close()
	}
	if err != nil {
		return ReadPdfAll2(path)
	}

	b, err := r.GetPlainText()
	if err != nil {
		return ReadPdfAll2(path)
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(b); err != nil {
		// A failed read would otherwise yield silently truncated text; treat
		// it like the other local failures and use the fallback.
		return ReadPdfAll2(path)
	}
	return TrimHtml(buf.String()), nil
}