159 lines
3.4 KiB
Go
159 lines
3.4 KiB
Go
|
package utils
|
|||
|
|
|||
|
import (
|
|||
|
"baliance.com/gooxml/document"
|
|||
|
"bytes"
|
|||
|
"compress/gzip"
|
|||
|
"context"
|
|||
|
"github.com/google/go-tika/tika"
|
|||
|
"github.com/ledongthuc/pdf"
|
|||
|
"github.com/tealeg/xlsx"
|
|||
|
"io"
|
|||
|
"io/ioutil"
|
|||
|
"log"
|
|||
|
"net/http"
|
|||
|
"os"
|
|||
|
"regexp"
|
|||
|
"strings"
|
|||
|
)
|
|||
|
|
|||
|
func ReadPdfAll2(path string) (string, error) {
|
|||
|
f, err := os.Open(path)
|
|||
|
defer f.Close()
|
|||
|
if err != nil {
|
|||
|
return "", err
|
|||
|
}
|
|||
|
|
|||
|
client := tika.NewClient(nil, "http://127.0.0.1:9998")
|
|||
|
res, err := client.Parse(context.TODO(), f)
|
|||
|
return TrimHtml(res), err
|
|||
|
}
|
|||
|
|
|||
|
//读取Excel全部数据
|
|||
|
func ReadExcelAll(excelPath string) ([]string, error) {
|
|||
|
// 打开 Excel 文件
|
|||
|
xlFile, err := xlsx.OpenFile(excelPath)
|
|||
|
if err != nil {
|
|||
|
return nil, err
|
|||
|
}
|
|||
|
texts := make([]string, 0)
|
|||
|
// 遍历每个 Sheet
|
|||
|
for _, sheet := range xlFile.Sheets {
|
|||
|
// 遍历每行数据
|
|||
|
for _, row := range sheet.Rows {
|
|||
|
line := ""
|
|||
|
// 遍历每个单元格
|
|||
|
for _, cell := range row.Cells {
|
|||
|
// 输出单元格的值
|
|||
|
line += cell.Value
|
|||
|
}
|
|||
|
if line == "" {
|
|||
|
continue
|
|||
|
}
|
|||
|
texts = append(texts, line)
|
|||
|
}
|
|||
|
|
|||
|
}
|
|||
|
return texts, nil
|
|||
|
}
|
|||
|
func ReadDocxAll(fileName string) (string, error) {
|
|||
|
doc, err := document.Open(fileName)
|
|||
|
if err != nil {
|
|||
|
return "", err
|
|||
|
}
|
|||
|
text := ""
|
|||
|
|
|||
|
for _, para := range doc.Paragraphs() {
|
|||
|
//run为每个段落相同格式的文字组成的片段
|
|||
|
for _, run := range para.Runs() {
|
|||
|
text += run.Text()
|
|||
|
}
|
|||
|
}
|
|||
|
return text, nil
|
|||
|
}
|
|||
|
|
|||
|
//读取pdf文字内容
|
|||
|
func ReadPdfAll(path string) (string, error) {
|
|||
|
f, r, err := pdf.Open(path)
|
|||
|
text := ""
|
|||
|
// remember close file
|
|||
|
defer f.Close()
|
|||
|
if err != nil {
|
|||
|
if err != nil {
|
|||
|
text, err = ReadPdfAll2(path)
|
|||
|
}
|
|||
|
return text, err
|
|||
|
}
|
|||
|
var buf bytes.Buffer
|
|||
|
b, err := r.GetPlainText()
|
|||
|
if err != nil {
|
|||
|
if err != nil {
|
|||
|
text, err = ReadPdfAll2(path)
|
|||
|
}
|
|||
|
return text, err
|
|||
|
}
|
|||
|
buf.ReadFrom(b)
|
|||
|
return TrimHtml(buf.String()), nil
|
|||
|
}
|
|||
|
// HttpGet issues an HTTP GET request for url and returns the response body
// as a string.
//
// Any failure (request error, invalid gzip stream, read error) is logged and
// reported to the caller as an empty string. The status code is not
// inspected, and no character-set conversion is applied: the body bytes are
// converted to a string as-is.
func HttpGet(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Println(err)
		return ""
	}
	// Close the body on every return path, including the gzip failure below.
	defer res.Body.Close()

	var reader io.ReadCloser = res.Body
	// NOTE(review): the default net/http transport normally decodes gzip
	// transparently and removes this header, so this branch presumably only
	// fires when that behaviour is disabled — confirm before relying on it.
	if res.Header.Get("Content-Encoding") == "gzip" {
		gz, err := gzip.NewReader(res.Body)
		if err != nil {
			log.Println(err)
			return ""
		}
		defer gz.Close()
		reader = gz
	}

	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Println(err)
		return ""
	}
	return string(body)
}
|
|||
|
|
|||
|
// Patterns used by TrimHtml. They are compiled once at package
// initialization instead of on every call.
var (
	// trimHTMLTagRe matches anything between a '<' and the nearest following '>'.
	trimHTMLTagRe = regexp.MustCompile(`\<[\S\s]+?\>`)
	// trimHTMLStyleRe matches a lower-case <style ...>...</style> element.
	trimHTMLStyleRe = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	// trimHTMLScriptRe matches a lower-case <script ...>...</script> element.
	trimHTMLScriptRe = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	// trimHTMLSpaceRe matches a run of two or more whitespace characters.
	trimHTMLSpaceRe = regexp.MustCompile(`\s{2,}`)
)

// TrimHtml strips HTML tags from src and returns the remaining text.
//
// Processing order:
//  1. every <...> span is lower-cased so the style/script patterns match
//     regardless of the original tag case;
//  2. <style>...</style> elements are removed together with their content;
//  3. <script>...</script> elements are removed together with their content;
//  4. all remaining <...> spans are deleted (nothing is inserted in their
//     place);
//  5. every run of two or more whitespace characters is deleted entirely, so
//     text separated only by such a run ends up joined; single whitespace
//     characters are kept;
//  6. leading and trailing whitespace is trimmed.
func TrimHtml(src string) string {
	src = trimHTMLTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	src = trimHTMLStyleRe.ReplaceAllString(src, "")
	src = trimHTMLScriptRe.ReplaceAllString(src, "")
	src = trimHTMLTagRe.ReplaceAllString(src, "")
	src = trimHTMLSpaceRe.ReplaceAllString(src, "")
	return strings.TrimSpace(src)
}
|
|||
|
|
|||
|
// RemoveRepByMap filters duplicate elements out of slc by relying on the
// uniqueness of map keys. The first occurrence of each string is kept and
// the original order is preserved. The returned slice is never nil, even
// when slc is nil or empty.
func RemoveRepByMap(slc []string) []string {
	unique := make([]string, 0)
	seen := make(map[string]byte) // keys already emitted
	for _, item := range slc {
		if _, dup := seen[item]; dup {
			continue
		}
		seen[item] = 0
		unique = append(unique, item)
	}
	return unique
}
|