91 lines
1.7 KiB
Go
91 lines
1.7 KiB
Go
|
package tools
|
||
|
|
||
|
import (
|
||
|
"baliance.com/gooxml/document"
|
||
|
"bytes"
|
||
|
"context"
|
||
|
"github.com/google/go-tika/tika"
|
||
|
"github.com/ledongthuc/pdf"
|
||
|
"github.com/tealeg/xlsx"
|
||
|
"os"
|
||
|
)
|
||
|
|
||
|
func ReadPdfAll2(path string) (string, error) {
|
||
|
f, err := os.Open(path)
|
||
|
defer f.Close()
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
|
||
|
client := tika.NewClient(nil, "http://127.0.0.1:9998")
|
||
|
res, err := client.Parse(context.TODO(), f)
|
||
|
return TrimHtml(res), err
|
||
|
}
|
||
|
|
||
|
//读取Excel全部数据
|
||
|
func ReadExcelAll(excelPath string) ([]string, error) {
|
||
|
// 打开 Excel 文件
|
||
|
xlFile, err := xlsx.OpenFile(excelPath)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
texts := make([]string, 0)
|
||
|
// 遍历每个 Sheet
|
||
|
for _, sheet := range xlFile.Sheets {
|
||
|
// 遍历每行数据
|
||
|
for _, row := range sheet.Rows {
|
||
|
line := ""
|
||
|
// 遍历每个单元格
|
||
|
for _, cell := range row.Cells {
|
||
|
// 输出单元格的值
|
||
|
line += cell.Value
|
||
|
}
|
||
|
if line == "" {
|
||
|
continue
|
||
|
}
|
||
|
texts = append(texts, line)
|
||
|
}
|
||
|
|
||
|
}
|
||
|
return texts, nil
|
||
|
}
|
||
|
func ReadDocxAll(fileName string) (string, error) {
|
||
|
doc, err := document.Open(fileName)
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
text := ""
|
||
|
|
||
|
for _, para := range doc.Paragraphs() {
|
||
|
//run为每个段落相同格式的文字组成的片段
|
||
|
for _, run := range para.Runs() {
|
||
|
text += run.Text()
|
||
|
}
|
||
|
}
|
||
|
return text, nil
|
||
|
}
|
||
|
|
||
|
//读取pdf文字内容
|
||
|
func ReadPdfAll(path string) (string, error) {
|
||
|
f, r, err := pdf.Open(path)
|
||
|
text := ""
|
||
|
// remember close file
|
||
|
defer f.Close()
|
||
|
if err != nil {
|
||
|
if err != nil {
|
||
|
text, err = ReadPdfAll2(path)
|
||
|
}
|
||
|
return text, err
|
||
|
}
|
||
|
var buf bytes.Buffer
|
||
|
b, err := r.GetPlainText()
|
||
|
if err != nil {
|
||
|
if err != nil {
|
||
|
text, err = ReadPdfAll2(path)
|
||
|
}
|
||
|
return text, err
|
||
|
}
|
||
|
buf.ReadFrom(b)
|
||
|
return TrimHtml(buf.String()), nil
|
||
|
}
|