package utils import ( "baliance.com/gooxml/document" "bytes" "compress/gzip" "context" "github.com/google/go-tika/tika" "github.com/ledongthuc/pdf" "github.com/tealeg/xlsx" "io" "io/ioutil" "log" "net/http" "os" "regexp" "strings" ) func ReadPdfAll2(path string) (string, error) { f, err := os.Open(path) defer f.Close() if err != nil { return "", err } client := tika.NewClient(nil, "http://127.0.0.1:9998") res, err := client.Parse(context.TODO(), f) return TrimHtml(res), err } //读取Excel全部数据 func ReadExcelAll(excelPath string) ([]string, error) { // 打开 Excel 文件 xlFile, err := xlsx.OpenFile(excelPath) if err != nil { return nil, err } texts := make([]string, 0) // 遍历每个 Sheet for _, sheet := range xlFile.Sheets { // 遍历每行数据 for _, row := range sheet.Rows { line := "" // 遍历每个单元格 for _, cell := range row.Cells { // 输出单元格的值 line += cell.Value } if line == "" { continue } texts = append(texts, line) } } return texts, nil } func ReadDocxAll(fileName string) (string, error) { doc, err := document.Open(fileName) if err != nil { return "", err } text := "" for _, para := range doc.Paragraphs() { //run为每个段落相同格式的文字组成的片段 for _, run := range para.Runs() { text += run.Text() } } return text, nil } //读取pdf文字内容 func ReadPdfAll(path string) (string, error) { f, r, err := pdf.Open(path) text := "" // remember close file defer f.Close() if err != nil { if err != nil { text, err = ReadPdfAll2(path) } return text, err } var buf bytes.Buffer b, err := r.GetPlainText() if err != nil { if err != nil { text, err = ReadPdfAll2(path) } return text, err } buf.ReadFrom(b) return TrimHtml(buf.String()), nil } func HttpGet(url string) string { res, err := http.Get(url) if err != nil { log.Println(err) return "" } var reader io.ReadCloser if res.Header.Get("Content-Encoding") == "gzip" { reader, err = gzip.NewReader(res.Body) if err != nil { return "" } } else { reader = res.Body } //utf8Reader := transform.NewReader(reader, // simplifiedchinese.GBK.NewDecoder()) robots, err := ioutil.ReadAll(reader) res.Body.Close() if err != nil { log.Println(err) return "" } return string(robots) } /** 去除html标签,过滤html标签 */ func TrimHtml(src string) string { //将HTML标签全转换成小写 re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") src = re.ReplaceAllStringFunc(src, strings.ToLower) //去除STYLE re, _ = regexp.Compile("\\") src = re.ReplaceAllString(src, "") //去除SCRIPT re, _ = regexp.Compile("\\") src = re.ReplaceAllString(src, "") //去除所有尖括号内的HTML代码,并换成换行符 re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") src = re.ReplaceAllString(src, "") //去除连续的换行符 re, _ = regexp.Compile("\\s{2,}") src = re.ReplaceAllString(src, "") return strings.TrimSpace(src) } // 通过map主键唯一的特性过滤重复元素 func RemoveRepByMap(slc []string) []string { result := []string{} tempMap := map[string]byte{} // 存放不重复主键 for _, e := range slc { l := len(tempMap) tempMap[e] = 0 if len(tempMap) != l { // 加入map后,map长度变化,则元素不重复 result = append(result, e) } } return result }