Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
535 lines
13 KiB
535 lines
13 KiB
package html2text |
|
|
|
import ( |
|
"bytes" |
|
"io" |
|
"regexp" |
|
"strings" |
|
"unicode" |
|
|
|
"github.com/olekukonko/tablewriter" |
|
"github.com/ssor/bom" |
|
"golang.org/x/net/html" |
|
"golang.org/x/net/html/atom" |
|
) |
|
|
|
// Options provide toggles and overrides to control specific rendering behaviors. |
|
type Options struct { |
|
PrettyTables bool // Turns on pretty ASCII rendering for table elements. |
|
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. |
|
OmitLinks bool // Turns on omitting links |
|
} |
|
|
|
// PrettyTablesOptions overrides tablewriter behaviors |
|
type PrettyTablesOptions struct { |
|
AutoFormatHeader bool |
|
AutoWrapText bool |
|
ReflowDuringAutoWrap bool |
|
ColWidth int |
|
ColumnSeparator string |
|
RowSeparator string |
|
CenterSeparator string |
|
HeaderAlignment int |
|
FooterAlignment int |
|
Alignment int |
|
ColumnAlignment []int |
|
NewLine string |
|
HeaderLine bool |
|
RowLine bool |
|
AutoMergeCells bool |
|
Borders tablewriter.Border |
|
} |
|
|
|
// NewPrettyTablesOptions creates PrettyTablesOptions with default settings |
|
func NewPrettyTablesOptions() *PrettyTablesOptions { |
|
return &PrettyTablesOptions{ |
|
AutoFormatHeader: true, |
|
AutoWrapText: true, |
|
ReflowDuringAutoWrap: true, |
|
ColWidth: tablewriter.MAX_ROW_WIDTH, |
|
ColumnSeparator: tablewriter.COLUMN, |
|
RowSeparator: tablewriter.ROW, |
|
CenterSeparator: tablewriter.CENTER, |
|
HeaderAlignment: tablewriter.ALIGN_DEFAULT, |
|
FooterAlignment: tablewriter.ALIGN_DEFAULT, |
|
Alignment: tablewriter.ALIGN_DEFAULT, |
|
ColumnAlignment: []int{}, |
|
NewLine: tablewriter.NEWLINE, |
|
HeaderLine: true, |
|
RowLine: false, |
|
AutoMergeCells: false, |
|
Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true}, |
|
} |
|
} |
|
|
|
// FromHTMLNode renders text output from a pre-parsed HTML document. |
|
func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { |
|
var options Options |
|
if len(o) > 0 { |
|
options = o[0] |
|
} |
|
|
|
ctx := textifyTraverseContext{ |
|
buf: bytes.Buffer{}, |
|
options: options, |
|
} |
|
if err := ctx.traverse(doc); err != nil { |
|
return "", err |
|
} |
|
|
|
text := strings.TrimSpace(newlineRe.ReplaceAllString( |
|
strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), |
|
) |
|
return text, nil |
|
} |
|
|
|
// FromReader renders text output after parsing HTML for the specified |
|
// io.Reader. |
|
func FromReader(reader io.Reader, options ...Options) (string, error) { |
|
newReader, err := bom.NewReaderWithoutBom(reader) |
|
if err != nil { |
|
return "", err |
|
} |
|
doc, err := html.Parse(newReader) |
|
if err != nil { |
|
return "", err |
|
} |
|
return FromHTMLNode(doc, options...) |
|
} |
|
|
|
// FromString parses HTML from the input string, then renders the text form. |
|
func FromString(input string, options ...Options) (string, error) { |
|
bs := bom.CleanBom([]byte(input)) |
|
text, err := FromReader(bytes.NewReader(bs), options...) |
|
if err != nil { |
|
return "", err |
|
} |
|
return text, nil |
|
} |
|
|
|
var ( |
|
spacingRe = regexp.MustCompile(`[ \r\n\t]+`) |
|
newlineRe = regexp.MustCompile(`\n\n+`) |
|
) |
|
|
|
// traverseTableCtx holds text-related context. |
|
type textifyTraverseContext struct { |
|
buf bytes.Buffer |
|
|
|
prefix string |
|
tableCtx tableTraverseContext |
|
options Options |
|
endsWithSpace bool |
|
justClosedDiv bool |
|
blockquoteLevel int |
|
lineLength int |
|
isPre bool |
|
} |
|
|
|
// tableTraverseContext holds table ASCII-form related context. |
|
type tableTraverseContext struct { |
|
header []string |
|
body [][]string |
|
footer []string |
|
tmpRow int |
|
isInFooter bool |
|
} |
|
|
|
func (tableCtx *tableTraverseContext) init() { |
|
tableCtx.body = [][]string{} |
|
tableCtx.header = []string{} |
|
tableCtx.footer = []string{} |
|
tableCtx.isInFooter = false |
|
tableCtx.tmpRow = 0 |
|
} |
|
|
|
func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { |
|
ctx.justClosedDiv = false |
|
|
|
switch node.DataAtom { |
|
case atom.Br: |
|
return ctx.emit("\n") |
|
|
|
case atom.H1, atom.H2, atom.H3: |
|
subCtx := textifyTraverseContext{} |
|
if err := subCtx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
|
|
str := subCtx.buf.String() |
|
dividerLen := 0 |
|
for _, line := range strings.Split(str, "\n") { |
|
if lineLen := len([]rune(line)); lineLen-1 > dividerLen { |
|
dividerLen = lineLen - 1 |
|
} |
|
} |
|
var divider string |
|
if node.DataAtom == atom.H1 { |
|
divider = strings.Repeat("*", dividerLen) |
|
} else { |
|
divider = strings.Repeat("-", dividerLen) |
|
} |
|
|
|
if node.DataAtom == atom.H3 { |
|
return ctx.emit("\n\n" + str + "\n" + divider + "\n\n") |
|
} |
|
return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n") |
|
|
|
case atom.Blockquote: |
|
ctx.blockquoteLevel++ |
|
ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " " |
|
if err := ctx.emit("\n"); err != nil { |
|
return err |
|
} |
|
if ctx.blockquoteLevel == 1 { |
|
if err := ctx.emit("\n"); err != nil { |
|
return err |
|
} |
|
} |
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
ctx.blockquoteLevel-- |
|
ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) |
|
if ctx.blockquoteLevel > 0 { |
|
ctx.prefix += " " |
|
} |
|
return ctx.emit("\n\n") |
|
|
|
case atom.Div: |
|
if ctx.lineLength > 0 { |
|
if err := ctx.emit("\n"); err != nil { |
|
return err |
|
} |
|
} |
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
var err error |
|
if !ctx.justClosedDiv { |
|
err = ctx.emit("\n") |
|
} |
|
ctx.justClosedDiv = true |
|
return err |
|
|
|
case atom.Li: |
|
if err := ctx.emit("* "); err != nil { |
|
return err |
|
} |
|
|
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
|
|
return ctx.emit("\n") |
|
|
|
case atom.B, atom.Strong: |
|
subCtx := textifyTraverseContext{} |
|
subCtx.endsWithSpace = true |
|
if err := subCtx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
str := subCtx.buf.String() |
|
return ctx.emit("*" + str + "*") |
|
|
|
case atom.A: |
|
linkText := "" |
|
// For simple link element content with single text node only, peek at the link text. |
|
if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode { |
|
linkText = node.FirstChild.Data |
|
} |
|
|
|
// If image is the only child, take its alt text as the link text. |
|
if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img { |
|
if altText := getAttrVal(img, "alt"); altText != "" { |
|
if err := ctx.emit(altText); err != nil { |
|
return err |
|
} |
|
} |
|
} else if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
|
|
hrefLink := "" |
|
if attrVal := getAttrVal(node, "href"); attrVal != "" { |
|
attrVal = ctx.normalizeHrefLink(attrVal) |
|
// Don't print link href if it matches link element content or if the link is empty. |
|
if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal { |
|
hrefLink = "( " + attrVal + " )" |
|
} |
|
} |
|
|
|
return ctx.emit(hrefLink) |
|
|
|
case atom.P, atom.Ul: |
|
return ctx.paragraphHandler(node) |
|
|
|
case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td: |
|
if ctx.options.PrettyTables { |
|
return ctx.handleTableElement(node) |
|
} else if node.DataAtom == atom.Table { |
|
return ctx.paragraphHandler(node) |
|
} |
|
return ctx.traverseChildren(node) |
|
|
|
case atom.Pre: |
|
ctx.isPre = true |
|
err := ctx.traverseChildren(node) |
|
ctx.isPre = false |
|
return err |
|
|
|
case atom.Style, atom.Script, atom.Head: |
|
// Ignore the subtree. |
|
return nil |
|
|
|
default: |
|
return ctx.traverseChildren(node) |
|
} |
|
} |
|
|
|
// paragraphHandler renders node children surrounded by double newlines. |
|
func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error { |
|
if err := ctx.emit("\n\n"); err != nil { |
|
return err |
|
} |
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
return ctx.emit("\n\n") |
|
} |
|
|
|
// handleTableElement is only to be invoked when options.PrettyTables is active. |
|
func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error { |
|
if !ctx.options.PrettyTables { |
|
panic("handleTableElement invoked when PrettyTables not active") |
|
} |
|
|
|
switch node.DataAtom { |
|
case atom.Table: |
|
if err := ctx.emit("\n\n"); err != nil { |
|
return err |
|
} |
|
|
|
// Re-intialize all table context. |
|
ctx.tableCtx.init() |
|
|
|
// Browse children, enriching context with table data. |
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
|
|
buf := &bytes.Buffer{} |
|
table := tablewriter.NewWriter(buf) |
|
if ctx.options.PrettyTablesOptions != nil { |
|
options := ctx.options.PrettyTablesOptions |
|
table.SetAutoFormatHeaders(options.AutoFormatHeader) |
|
table.SetAutoWrapText(options.AutoWrapText) |
|
table.SetReflowDuringAutoWrap(options.ReflowDuringAutoWrap) |
|
table.SetColWidth(options.ColWidth) |
|
table.SetColumnSeparator(options.ColumnSeparator) |
|
table.SetRowSeparator(options.RowSeparator) |
|
table.SetCenterSeparator(options.CenterSeparator) |
|
table.SetHeaderAlignment(options.HeaderAlignment) |
|
table.SetFooterAlignment(options.FooterAlignment) |
|
table.SetAlignment(options.Alignment) |
|
table.SetColumnAlignment(options.ColumnAlignment) |
|
table.SetNewLine(options.NewLine) |
|
table.SetHeaderLine(options.HeaderLine) |
|
table.SetRowLine(options.RowLine) |
|
table.SetAutoMergeCells(options.AutoMergeCells) |
|
table.SetBorders(options.Borders) |
|
} |
|
table.SetHeader(ctx.tableCtx.header) |
|
table.SetFooter(ctx.tableCtx.footer) |
|
table.AppendBulk(ctx.tableCtx.body) |
|
|
|
// Render the table using ASCII. |
|
table.Render() |
|
if err := ctx.emit(buf.String()); err != nil { |
|
return err |
|
} |
|
|
|
return ctx.emit("\n\n") |
|
|
|
case atom.Tfoot: |
|
ctx.tableCtx.isInFooter = true |
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
ctx.tableCtx.isInFooter = false |
|
|
|
case atom.Tr: |
|
ctx.tableCtx.body = append(ctx.tableCtx.body, []string{}) |
|
if err := ctx.traverseChildren(node); err != nil { |
|
return err |
|
} |
|
ctx.tableCtx.tmpRow++ |
|
|
|
case atom.Th: |
|
res, err := ctx.renderEachChild(node) |
|
if err != nil { |
|
return err |
|
} |
|
|
|
ctx.tableCtx.header = append(ctx.tableCtx.header, res) |
|
|
|
case atom.Td: |
|
res, err := ctx.renderEachChild(node) |
|
if err != nil { |
|
return err |
|
} |
|
|
|
if ctx.tableCtx.isInFooter { |
|
ctx.tableCtx.footer = append(ctx.tableCtx.footer, res) |
|
} else { |
|
ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res) |
|
} |
|
|
|
} |
|
return nil |
|
} |
|
|
|
func (ctx *textifyTraverseContext) traverse(node *html.Node) error { |
|
switch node.Type { |
|
default: |
|
return ctx.traverseChildren(node) |
|
|
|
case html.TextNode: |
|
var data string |
|
if ctx.isPre { |
|
data = node.Data |
|
} else { |
|
data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " ")) |
|
} |
|
return ctx.emit(data) |
|
|
|
case html.ElementNode: |
|
return ctx.handleElement(node) |
|
} |
|
} |
|
|
|
func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error { |
|
for c := node.FirstChild; c != nil; c = c.NextSibling { |
|
if err := ctx.traverse(c); err != nil { |
|
return err |
|
} |
|
} |
|
|
|
return nil |
|
} |
|
|
|
func (ctx *textifyTraverseContext) emit(data string) error { |
|
if data == "" { |
|
return nil |
|
} |
|
var ( |
|
lines = ctx.breakLongLines(data) |
|
err error |
|
) |
|
for _, line := range lines { |
|
runes := []rune(line) |
|
startsWithSpace := unicode.IsSpace(runes[0]) |
|
if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { |
|
if err = ctx.buf.WriteByte(' '); err != nil { |
|
return err |
|
} |
|
ctx.lineLength++ |
|
} |
|
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) |
|
for _, c := range line { |
|
if _, err = ctx.buf.WriteString(string(c)); err != nil { |
|
return err |
|
} |
|
ctx.lineLength++ |
|
if c == '\n' { |
|
ctx.lineLength = 0 |
|
if ctx.prefix != "" { |
|
if _, err = ctx.buf.WriteString(ctx.prefix); err != nil { |
|
return err |
|
} |
|
} |
|
} |
|
} |
|
} |
|
return nil |
|
} |
|
|
|
const maxLineLen = 74 |
|
|
|
func (ctx *textifyTraverseContext) breakLongLines(data string) []string { |
|
// Only break lines when in blockquotes. |
|
if ctx.blockquoteLevel == 0 { |
|
return []string{data} |
|
} |
|
var ( |
|
ret = []string{} |
|
runes = []rune(data) |
|
l = len(runes) |
|
existing = ctx.lineLength |
|
) |
|
if existing >= maxLineLen { |
|
ret = append(ret, "\n") |
|
existing = 0 |
|
} |
|
for l+existing > maxLineLen { |
|
i := maxLineLen - existing |
|
for i >= 0 && !unicode.IsSpace(runes[i]) { |
|
i-- |
|
} |
|
if i == -1 { |
|
// No spaces, so go the other way. |
|
i = maxLineLen - existing |
|
for i < l && !unicode.IsSpace(runes[i]) { |
|
i++ |
|
} |
|
} |
|
ret = append(ret, string(runes[:i])+"\n") |
|
for i < l && unicode.IsSpace(runes[i]) { |
|
i++ |
|
} |
|
runes = runes[i:] |
|
l = len(runes) |
|
existing = 0 |
|
} |
|
if len(runes) > 0 { |
|
ret = append(ret, string(runes)) |
|
} |
|
return ret |
|
} |
|
|
|
func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string { |
|
link = strings.TrimSpace(link) |
|
link = strings.TrimPrefix(link, "mailto:") |
|
return link |
|
} |
|
|
|
// renderEachChild visits each direct child of a node and collects the sequence of |
|
// textuual representaitons separated by a single newline. |
|
func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) { |
|
buf := &bytes.Buffer{} |
|
for c := node.FirstChild; c != nil; c = c.NextSibling { |
|
s, err := FromHTMLNode(c, ctx.options) |
|
if err != nil { |
|
return "", err |
|
} |
|
if _, err = buf.WriteString(s); err != nil { |
|
return "", err |
|
} |
|
if c.NextSibling != nil { |
|
if err = buf.WriteByte('\n'); err != nil { |
|
return "", err |
|
} |
|
} |
|
} |
|
return buf.String(), nil |
|
} |
|
|
|
func getAttrVal(node *html.Node, attrName string) string { |
|
for _, attr := range node.Attr { |
|
if attr.Key == attrName { |
|
return attr.Val |
|
} |
|
} |
|
|
|
return "" |
|
}
|
|
|