159 lines
3.4 KiB
Go
159 lines
3.4 KiB
Go
package utils
|
||
|
||
import (
|
||
"baliance.com/gooxml/document"
|
||
"bytes"
|
||
"compress/gzip"
|
||
"context"
|
||
"github.com/google/go-tika/tika"
|
||
"github.com/ledongthuc/pdf"
|
||
"github.com/tealeg/xlsx"
|
||
"io"
|
||
"io/ioutil"
|
||
"log"
|
||
"net/http"
|
||
"os"
|
||
"regexp"
|
||
"strings"
|
||
)
|
||
|
||
func ReadPdfAll2(path string) (string, error) {
|
||
f, err := os.Open(path)
|
||
defer f.Close()
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
client := tika.NewClient(nil, "http://127.0.0.1:9998")
|
||
res, err := client.Parse(context.TODO(), f)
|
||
return TrimHtml(res), err
|
||
}
|
||
|
||
//读取Excel全部数据
|
||
func ReadExcelAll(excelPath string) ([]string, error) {
|
||
// 打开 Excel 文件
|
||
xlFile, err := xlsx.OpenFile(excelPath)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
texts := make([]string, 0)
|
||
// 遍历每个 Sheet
|
||
for _, sheet := range xlFile.Sheets {
|
||
// 遍历每行数据
|
||
for _, row := range sheet.Rows {
|
||
line := ""
|
||
// 遍历每个单元格
|
||
for _, cell := range row.Cells {
|
||
// 输出单元格的值
|
||
line += cell.Value
|
||
}
|
||
if line == "" {
|
||
continue
|
||
}
|
||
texts = append(texts, line)
|
||
}
|
||
|
||
}
|
||
return texts, nil
|
||
}
|
||
func ReadDocxAll(fileName string) (string, error) {
|
||
doc, err := document.Open(fileName)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
text := ""
|
||
|
||
for _, para := range doc.Paragraphs() {
|
||
//run为每个段落相同格式的文字组成的片段
|
||
for _, run := range para.Runs() {
|
||
text += run.Text()
|
||
}
|
||
}
|
||
return text, nil
|
||
}
|
||
|
||
//读取pdf文字内容
|
||
func ReadPdfAll(path string) (string, error) {
|
||
f, r, err := pdf.Open(path)
|
||
text := ""
|
||
// remember close file
|
||
defer f.Close()
|
||
if err != nil {
|
||
if err != nil {
|
||
text, err = ReadPdfAll2(path)
|
||
}
|
||
return text, err
|
||
}
|
||
var buf bytes.Buffer
|
||
b, err := r.GetPlainText()
|
||
if err != nil {
|
||
if err != nil {
|
||
text, err = ReadPdfAll2(path)
|
||
}
|
||
return text, err
|
||
}
|
||
buf.ReadFrom(b)
|
||
return TrimHtml(buf.String()), nil
|
||
}
|
||
// HttpGet issues an HTTP GET request for url and returns the response body
// as a string. If the response carries a "Content-Encoding: gzip" header the
// body is decompressed before being returned.
//
// Any failure (request error, invalid gzip stream, read error) is logged and
// reported to the caller as an empty string. The HTTP status code is not
// inspected. The body bytes are returned as-is; no charset conversion
// (for example GBK to UTF-8) is applied.
func HttpGet(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Println(err)
		return ""
	}
	// Close the body on every return path, including the gzip failure below.
	defer res.Body.Close()

	var reader io.Reader = res.Body
	if res.Header.Get("Content-Encoding") == "gzip" {
		gz, err := gzip.NewReader(res.Body)
		if err != nil {
			log.Println(err)
			return ""
		}
		defer gz.Close()
		reader = gz
	}

	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Println(err)
		return ""
	}
	return string(body)
}
|
||
|
||
// Regular expressions used by TrimHtml, compiled once at package
// initialization instead of on every call.
var (
	// htmlTagRe matches any "<...>" span (shortest match, may cross lines).
	htmlTagRe = regexp.MustCompile(`\<[\S\s]+?\>`)
	// htmlStyleRe matches a lowercase <style ...>...</style> element.
	htmlStyleRe = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	// htmlScriptRe matches a lowercase <script ...>...</script> element.
	htmlScriptRe = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	// htmlSpaceRunRe matches a run of two or more whitespace characters.
	htmlSpaceRunRe = regexp.MustCompile(`\s{2,}`)
)

// TrimHtml strips HTML markup from src and returns the remaining text.
//
// Processing steps, in order:
//  1. every "<...>" tag is lowercased so the following patterns match
//     regardless of the original case;
//  2. <style>...</style> and <script>...</script> elements are removed
//     together with their content;
//  3. all remaining tags are removed (replaced by the empty string);
//  4. every run of two or more whitespace characters is removed entirely
//     (single whitespace characters are kept);
//  5. leading and trailing whitespace is trimmed.
func TrimHtml(src string) string {
	// Lowercase all tags.
	src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	// Drop STYLE elements.
	src = htmlStyleRe.ReplaceAllString(src, "")
	// Drop SCRIPT elements.
	src = htmlScriptRe.ReplaceAllString(src, "")
	// Drop every remaining angle-bracketed tag.
	src = htmlTagRe.ReplaceAllString(src, "")
	// Drop runs of consecutive whitespace (including repeated newlines).
	src = htmlSpaceRunRe.ReplaceAllString(src, "")
	return strings.TrimSpace(src)
}
|
||
|
||
// RemoveRepByMap returns the distinct elements of slc in order of first
// appearance, using a map as the set of values already emitted.
// The result is always a non-nil slice, even when slc is nil or empty.
func RemoveRepByMap(slc []string) []string {
	unique := make([]string, 0)
	seen := make(map[string]struct{}) // values that have already been emitted
	for _, item := range slc {
		if _, dup := seen[item]; dup {
			continue
		}
		seen[item] = struct{}{}
		unique = append(unique, item)
	}
	return unique
}
|