package uk.ac.warwick.util.content.cleaner;

import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ccil.cowan.tagsoup.Parser;
import org.ccil.cowan.tagsoup.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import uk.ac.warwick.html5.HTML5Schema;
import uk.ac.warwick.util.collections.Pair;
import uk.ac.warwick.util.collections.Triple;
import uk.ac.warwick.util.content.MutableContent;
import uk.ac.warwick.util.content.textile2.lite.TextileConstants;
import uk.ac.warwick.util.content.texttransformers.NewWindowLinkTextTransformer;
import uk.ac.warwick.util.core.ObjectProvider;

/* loaded from: input_file:uk/ac/warwick/util/content/cleaner/HtmlCleaner.class */
public final class HtmlCleaner implements Cleaner {
    /** SLF4J logger created for this class. */
    public static final Logger LOGGER = LoggerFactory.getLogger(HtmlCleaner.class);
    /** Literal (search, replacement) pairs applied, in list order, by {@code doPreParsingCleanup}. */
    private final List<Pair<String, String>> straightReplacements;
    /** Literal (search, replacement) pairs applied, in list order, by {@code doPostParsingCleanup}. */
    private final List<Pair<String, String>> postParseStraightReplacements;
    /**
     * (pattern, marker, replacement) triples applied by {@code doPreParsingCleanup}. The pattern is
     * only run when the lower-cased text contains the marker, so markers are written in lower case.
     */
    private final List<Triple<Pattern, String, String>> regexReplacements;
    /** (pattern, marker, replacement) triples applied by {@code doPostParsingCleanup}; same layout as above. */
    private final List<Triple<Pattern, String, String>> postParseRegexReplacements;
    /**
     * Optional content writer supplied at construction; may be {@code null}. When present,
     * {@code clean} gives it the CleanerWriter's own content writer as delegate and installs it on
     * the CleanerWriter.
     */
    private final HtmlContentWriter contentWriter;
    /** Asked for a new {@link TagAndAttributeFilter} on every {@code clean} call. */
    private ObjectProvider<TagAndAttributeFilter> filterProvider;
    /** Forwarded to the filter via {@code setAllowJavascriptHandlers}; the constructor sets it to {@code true}. */
    private boolean allowJavascriptHandlers;
    /** Value stored by {@code setAllowBlockquoteWithNoAttributes}; never assigned by the constructor. */
    private boolean allowBlockquoteWithNoAttributes;
    /** TagSoup schema handed to the parser in {@code clean}; the constructor sets it to a new {@link HTML5Schema}. */
    private Schema schema;

    /* loaded from: input_file:uk/ac/warwick/util/content/cleaner/HtmlCleaner$ContentType.class */
    /**
     * Kinds of content event. Not referenced anywhere else in this file.
     * NOTE(review): presumably consumed by the writer classes in this package — confirm before changing.
     */
    enum ContentType {
        none,
        elementStart,
        elementEnd,
        characters,
        whitespace
    }

    /**
     * Creates a cleaner with no custom content writer; equivalent to passing {@code null} to
     * {@link #HtmlCleaner(HtmlContentWriter)}.
     */
    public HtmlCleaner() {
        this((HtmlContentWriter) null);
    }

    /**
     * Creates a cleaner and builds its four replacement tables.
     *
     * <p>The tables are ordered lists and are applied in insertion order, so the order of the
     * {@code add} calls below is part of the behaviour. In every regex triple the middle element
     * is a marker string: the cleanup methods only run the pattern when the lower-cased text
     * contains that marker. The numeric {@code Pattern.compile} flags are
     * 2 = {@code CASE_INSENSITIVE} and 34 = {@code CASE_INSENSITIVE | DOTALL}.
     *
     * <p>NOTE(review): the values of {@code TextileConstants.EXP_PHRASE_MODIFIER} and
     * {@code TextileConstants.REPLACE_ESCAPED_GLYPHS} are not visible here. From their use as
     * replacements they are presumably the empty string and {@code "$1"} respectively, most likely
     * substituted by the decompiler for equal-valued literals — confirm.
     *
     * @param htmlContentWriter optional writer to install in front of the CleanerWriter's own
     *                          content writer during {@code clean}; may be {@code null}
     */
    public HtmlCleaner(HtmlContentWriter htmlContentWriter) {
        // Default provider: a fresh TagAndAttributeFilterImpl per request.
        // NOTE(review): the method name m9newInstance is a decompiler rename; presumably the
        // original overrides ObjectProvider.newInstance() — verify before recompiling.
        this.filterProvider = new ObjectProvider<TagAndAttributeFilter>() { // from class: uk.ac.warwick.util.content.cleaner.HtmlCleaner.1
            /* renamed from: newInstance, reason: merged with bridge method [inline-methods] */
            public TagAndAttributeFilter m9newInstance() {
                return new TagAndAttributeFilterImpl();
            }
        };
        this.allowJavascriptHandlers = true;
        this.schema = new HTML5Schema();
        this.contentWriter = htmlContentWriter;

        // ---- Pre-parse literal replacements ----
        this.straightReplacements = Lists.newArrayList();
        // Map mce_-prefixed attribute names (presumably TinyMCE temporaries) back to href/src.
        this.straightReplacements.add(Pair.of("_mce_thref=", "href="));
        this.straightReplacements.add(Pair.of("_mce_tsrc=", "src="));
        this.straightReplacements.add(Pair.of("mce_thref=", "href="));
        this.straightReplacements.add(Pair.of("mce_tsrc=", "src="));
        // Replace previously generated new-window link markers (image and <i> forms).
        this.straightReplacements.add(Pair.of(NewWindowLinkTextTransformer.HTML_IMAGE, TextileConstants.EXP_PHRASE_MODIFIER));
        this.straightReplacements.add(Pair.of("<i class='new-window-link' title='Link opens in a new window'></i>", TextileConstants.EXP_PHRASE_MODIFIER));
        // Literal middle dot becomes its numeric entity; the Word bullet pattern below keys on "&#183;".
        this.straightReplacements.add(Pair.of("·", "&#183;"));
        // &#65279; is U+FEFF (zero-width no-break space / byte order mark).
        this.straightReplacements.add(Pair.of("&#65279;", TextileConstants.EXP_PHRASE_MODIFIER));

        // ---- Pre-parse regex replacements ----
        this.regexReplacements = Lists.newArrayList();
        // &nbsp; handling, in four steps: collapse runs to one; protect an &nbsp; that is the only
        // content between two tags with a placeholder; turn every remaining &nbsp; into a plain
        // space; restore the placeholder to &nbsp;.
        this.regexReplacements.add(Triple.of(Pattern.compile("&nbsp;(&nbsp;)+"), "&nbsp;", "&nbsp;"));
        this.regexReplacements.add(Triple.of(Pattern.compile(">(&nbsp;| )*&nbsp;(&nbsp;| )*<"), "&nbsp;", ">_NONBREAKINGSPACE_<"));
        this.regexReplacements.add(Triple.of(Pattern.compile("&nbsp;"), "&nbsp;", " "));
        this.regexReplacements.add(Triple.of(Pattern.compile("_NONBREAKINGSPACE_"), "_nonbreakingspace_", "&nbsp;"));
        // Conditional-comment blocks: mso, supportFields, !mso and "gte vml 1" variants.
        this.regexReplacements.add(Triple.of(Pattern.compile("<!--\\[if [a-z]+ mso \\d*\\]>.*?<!\\-*\\[endif\\].*?-->", 34), "[endif]", TextileConstants.EXP_PHRASE_MODIFIER));
        this.regexReplacements.add(Triple.of(Pattern.compile("<!--\\[if supportFields\\]>.*?<!\\[endif\\]-->", 34), "[if supportfields]", TextileConstants.EXP_PHRASE_MODIFIER));
        this.regexReplacements.add(Triple.of(Pattern.compile("<!--\\[if !mso\\]>.*?<!-*\\[endif\\]-->", 34), "[if !mso]", TextileConstants.EXP_PHRASE_MODIFIER));
        this.regexReplacements.add(Triple.of(Pattern.compile("<!--\\[if gte vml 1\\]>.*?<!\\[endif\\]-->", 34), "[if gte vml 1]", TextileConstants.EXP_PHRASE_MODIFIER));
        // <br> elements flagged as bogus (both the mce_bogus and data-mce-bogus spellings).
        this.regexReplacements.add(Triple.of(Pattern.compile("<br _?mce_bogus=\"?1\"?\\s*/?>", 2), "_bogus", TextileConstants.EXP_PHRASE_MODIFIER));
        this.regexReplacements.add(Triple.of(Pattern.compile("<br data-mce-bogus=\"?1\"?\\s*/?>", 2), "data-mce-bogus", TextileConstants.EXP_PHRASE_MODIFIER));
        // <mce:style> whose body is wrapped in an HTML comment becomes a plain <style> element.
        this.regexReplacements.add(Triple.of(Pattern.compile("<mce:style([^>]*)>\\<\\!\\-\\-(.*?)\\-\\-\\></mce:style>", 34), "</mce:style>", "<style$1>$2</style>"));
        // <style> elements flagged as bogus.
        this.regexReplacements.add(Triple.of(Pattern.compile("<style[^>]* _?mce_bogus=\"?1\"?\\s*>.*?</style>", 34), "</style>", TextileConstants.EXP_PHRASE_MODIFIER));
        this.regexReplacements.add(Triple.of(Pattern.compile("<style[^>]* data-mce-bogus=\"?1\"?\\s*>.*?</style>", 34), "</style>", TextileConstants.EXP_PHRASE_MODIFIER));
        // Any remaining <mce:xxx>...</mce:xxx> element loses its "mce:" prefix.
        this.regexReplacements.add(Triple.of(Pattern.compile("<mce\\:([a-z]*)([^>]*)>(.*?)<\\/mce\\:\\1>", 34), "<mce:", "<$1$2>$3</$1>"));
        // A <p> containing only a <script> element (group 1 captures the script).
        this.regexReplacements.add(Triple.of(Pattern.compile("<p>\\s*(<script.*?<\\/script>)\\s*</p>", 34), "</script>", TextileConstants.REPLACE_ESCAPED_GLYPHS));
        // align=middle on td/th is rewritten to align="center".
        this.regexReplacements.add(Triple.of(Pattern.compile("(<t[dh][^>]*)\\salign=[\"']?middle[\"']?", 2), "middle", "$1 align=\"center\""));
        // Empty (or whitespace-only) table cells get a single &nbsp;.
        this.regexReplacements.add(Triple.of(Pattern.compile("(<t[dh][^>]*>)\\s*(</t[dh]>)", 34), "</t", "$1&nbsp;$2"));
        // A <p> containing only an HTML comment (group 1 captures the comment).
        this.regexReplacements.add(Triple.of(Pattern.compile("<p>\\s*(<!--.*?-->)\\s*</p>", 34), "<!--", TextileConstants.REPLACE_ESCAPED_GLYPHS));
        // Strip one or more "mce-" prefixes from "text/javascript"; only attempted when "<script" is present.
        this.regexReplacements.add(Triple.of(Pattern.compile("(mce-)+text/javascript", 34), "<script", "text/javascript"));
        // <meta>, <title> and <link> elements inside a paragraph are dropped; the paragraph text is kept.
        this.regexReplacements.add(Triple.of(Pattern.compile("<p>(.*?)<meta[^>]+>(.*?)</p>", 34), "<meta", "<p>$1$2</p>"));
        this.regexReplacements.add(Triple.of(Pattern.compile("<p>(.*?)<title>[^<]*</title>(.*?)</p>", 34), "</title>", "<p>$1$2</p>"));
        this.regexReplacements.add(Triple.of(Pattern.compile("<p>(.*?)<link[^>]+>(?:</link>)?(.*?)</p>", 34), "<link", "<p>$1$2</p>"));
        // Mso-classed paragraphs that start with a middle-dot bullet become <li> items.
        this.regexReplacements.add(Triple.of(Pattern.compile("<p[^>]*class=\"?Mso(?:[A-Z][a-z]+)+\"?[^>]*>(?:<!--\\[if !supportLists\\]-->)?(?:<\\/?(?:span|font)[^>]*>)*(?:&#183;|·)(?:<\\/?(?:span|font)[^>]*>)*(?:&nbsp;)*\\s*(?:<\\/?(?:span|font)[^>]*>)*(?:<!--\\[endif\\]-->)?(.*?)(?:<\\/?(?:span|font)[^>]*>)*</p>", 34), "&#183;", "<li>$1</li>"));

        // ---- Post-parse literal replacements: empty <b> / <strong> pairs ----
        this.postParseStraightReplacements = Lists.newArrayList();
        this.postParseStraightReplacements.add(Pair.of("<b></b>", TextileConstants.EXP_PHRASE_MODIFIER));
        this.postParseStraightReplacements.add(Pair.of("<strong></strong>", TextileConstants.EXP_PHRASE_MODIFIER));

        // ---- Post-parse regex replacements ----
        this.postParseRegexReplacements = Lists.newArrayList();
        // Empty paragraphs, together with any newlines that follow them.
        this.postParseRegexReplacements.add(Triple.of(Pattern.compile("<p>\\s*</p>\n*"), "</p>", TextileConstants.EXP_PHRASE_MODIFIER));
        // Indent round trip: before parsing, a style attribute holding only a padding-left of two or
        // more digits is renamed to tinymce_indent (note this is added to the PRE-parse list); after
        // parsing it is renamed back to style, swallowing a directly following style attribute.
        // NOTE(review): presumably this lets the indent survive the attribute filter — confirm.
        this.regexReplacements.add(Triple.of(Pattern.compile("\\bstyle=(\"padding-left:\\s*\\d{2,}px;?\\s*\")", 2), "padding-left", "tinymce_indent=$1"));
        this.postParseRegexReplacements.add(Triple.of(Pattern.compile("\\btinymce_indent=(\"padding-left:\\s*\\d{2,}px;?\\s*\")(?:\\sstyle=\"[^\"]*\")?", 2), "tinymce_indent", "style=$1"));
        // On tables the restored padding-left is turned into margin-left.
        this.postParseRegexReplacements.add(Triple.of(Pattern.compile("<table\\sstyle=\"padding(-left:\\s*\\d{2,}px;?\\s*)\"", 2), "<table", "<table style=\"margin$1\""));
        // A trailing paragraph that is empty or holds only a <br> at the very end of the output.
        this.postParseRegexReplacements.add(Triple.of(Pattern.compile("\\s*<p>\\s*(<br\\s*/?>)?\\s*</p>\\s*$", 34), "</p>", TextileConstants.EXP_PHRASE_MODIFIER));
        // Anchors with rel="lightbox[...]" and no content.
        this.postParseRegexReplacements.add(Triple.of(Pattern.compile("<a [^>]+rel=\"lightbox\\[[^>]+></a>", 34), "lightbox[", TextileConstants.EXP_PHRASE_MODIFIER));
    }

    /**
     * Cleans an HTML fragment: runs the pre-parse text cleanup, parses the result with TagSoup
     * through a {@link CleanerWriter} configured with a freshly provided filter, then runs the
     * post-parse cleanup on the writer's output.
     *
     * @param str            the HTML to clean
     * @param mutableContent passed through to the {@link CleanerWriter}
     * @return the cleaned HTML
     * @throws IllegalStateException if parsing fails with an {@link IOException} or {@link SAXException}
     */
    @Override // uk.ac.warwick.util.content.cleaner.Cleaner
    public String clean(String str, MutableContent mutableContent) {
        final String prepared = doPreParsingCleanup(str);
        final Parser tagSoup = new Parser();

        // A new filter per call, configured from this cleaner's current flags.
        final TagAndAttributeFilter filter = (TagAndAttributeFilter) this.filterProvider.newInstance();
        filter.setAllowJavascriptHandlers(isAllowJavascriptHandlers());
        filter.setAllowBlockquoteWithNoAttributes(isAllowBlockquoteWithNoAttributes());

        final CleanerWriter handler = new CleanerWriter(filter, mutableContent);
        if (null != this.contentWriter) {
            // Put the custom writer in front of the handler's own writer.
            this.contentWriter.setDelegate(handler.getContentWriter());
            handler.setContentWriter(this.contentWriter);
        }

        try {
            final InputSource source = new InputSource(new StringReader(prepared));
            tagSoup.setFeature("http://www.ccil.org/~cowan/tagsoup/features/default-attributes", false);
            tagSoup.setContentHandler(handler);
            tagSoup.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
            tagSoup.setProperty("http://www.ccil.org/~cowan/tagsoup/properties/schema", this.schema);
            tagSoup.parse(source);
            return doPostParsingCleanup(handler.getOutput());
        } catch (IOException ioFailure) {
            throw new IllegalStateException(ioFailure);
        } catch (SAXException saxFailure) {
            throw new IllegalStateException("HTML cleanup error", saxFailure);
        }
    }

    /**
     * Text-level cleanup applied to the raw HTML before it is handed to the parser.
     *
     * <p>Steps, in order: escape lone {@code <} characters outside script elements, apply the
     * literal replacements, apply each regex replacement whose marker occurs in the lower-cased
     * text, remove empty Office paragraphs and Office style blocks, and finally strip any
     * remaining {@code <!--[...]-->} comments.
     *
     * <p>Each regex is applied for at most ten {@code replaceAll} passes. Previously, once the ten
     * passes were used up and the pattern still matched (for example more than ten
     * {@code <meta>} tags inside one paragraph), the loop kept testing the pattern without
     * replacing anything and never terminated; it now stops after the last pass.
     *
     * @param str the raw HTML
     * @return the text ready for parsing
     */
    String doPreParsingCleanup(String str) {
        // Upper bound on replaceAll passes per pattern, so self-feeding replacements cannot spin.
        final int maxPasses = 10;

        String text = encodeLoneTags(str);
        for (Pair<String, String> pair : this.straightReplacements) {
            text = text.replace((CharSequence) pair.getLeft(), (CharSequence) pair.getRight());
        }

        for (Triple<Pattern, String, String> triple : this.regexReplacements) {
            // Cheap substring guard before running the regex. Locale.ROOT keeps the lower-casing
            // independent of the JVM default locale (e.g. Turkish dotless i in "<LINK").
            final String lowered = text.toLowerCase(java.util.Locale.ROOT);
            if (!lowered.contains((String) triple.getMiddle())) {
                continue;
            }
            final Pattern pattern = (Pattern) triple.getLeft();
            final String replacement = (String) triple.getRight();
            for (int pass = 0; pass < maxPasses; pass++) {
                final Matcher matcher = pattern.matcher(text);
                if (!matcher.find()) {
                    break;
                }
                // replaceAll resets the matcher itself, so the find() above does not skip a match.
                text = matcher.replaceAll(replacement);
            }
        }

        return doOfficeStyles(doComplexOfficeTags(text)).replaceAll("<!--\\[(.+?)]-->", TextileConstants.EXP_PHRASE_MODIFIER);
    }

    /**
     * Drops Mso-classed paragraphs whose entire content is an empty {@code <o:p>&nbsp;</o:p>}
     * construct (optionally wrapped in the bold/xml/strong-span forms matched below). All other
     * text, including Mso paragraphs with real content, is copied through unchanged.
     *
     * <p>Nothing is done unless the text contains both {@code "Mso"} and {@code "</o:p>"}.
     *
     * @param str the text to scan
     * @return the text without the empty Office paragraphs
     */
    private String doComplexOfficeTags(String str) {
        final boolean hasOfficeMarkup = str.contains("Mso") && str.contains("</o:p>");
        if (!hasOfficeMarkup) {
            return str;
        }

        final Pattern officeParagraph =
                Pattern.compile("<p[^>]*class=\"?Mso[a-z]+\"?[^>]*>(.*?)</p>", Pattern.CASE_INSENSITIVE);
        final Pattern emptyOfficeTag =
                Pattern.compile("(?:<\\?xml[^>]*>)?(?:<b style[^>]*>)?<o:p>(?:<font[^>]*>)?&nbsp;(?:</font>)?</o:p>(</b>)?", Pattern.CASE_INSENSITIVE);
        final Pattern emptyStrongSpan =
                Pattern.compile("<span [^>]*mce_name=\"strong\"[^>]*><o:p>(?:<font[^>]*>)?&nbsp;(?:</font>)?</o:p></span>", Pattern.CASE_INSENSITIVE);

        final StringBuilder kept = new StringBuilder();
        final Matcher paragraphs = officeParagraph.matcher(str);
        int cursor = 0;
        while (paragraphs.find()) {
            // Text between the previous paragraph and this one is always retained.
            kept.append(str, cursor, paragraphs.start());

            final String inner = paragraphs.group(1);
            final boolean isEmptyFiller =
                    emptyOfficeTag.matcher(inner).matches() || emptyStrongSpan.matcher(inner).matches();
            if (!isEmptyFiller) {
                kept.append(paragraphs.group());
            }
            cursor = paragraphs.end();
        }
        kept.append(str, cursor, str.length());
        return kept.toString();
    }

    /**
     * Removes {@code <style>} / {@code <mce:style>} elements (and the whitespace following them)
     * whose body has at least one line beginning with {@code mso-}. Style elements without such a
     * line, and all other text, are copied through unchanged.
     *
     * <p>Nothing is done unless the text contains a style opening tag (case-insensitively) and the
     * substring {@code "mso-"}.
     *
     * @param str the text to scan
     * @return the text without the Office style blocks
     */
    private String doOfficeStyles(String str) {
        final String lowered = str.toLowerCase();
        final boolean hasStyleTag = lowered.contains("<style") || lowered.contains("<mce:style");
        if (!hasStyleTag || !str.contains("mso-")) {
            return str;
        }

        final Pattern styleElement = Pattern.compile(
                "<(?:mce\\:)?style[^>]*>(.*?)</(?:mce\\:)?style>\\s*", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        // MULTILINE so that ^ and $ anchor at every line of the style body.
        final Pattern msoLine = Pattern.compile("^\\s*mso-.*$", Pattern.MULTILINE);

        final StringBuilder kept = new StringBuilder();
        final Matcher styles = styleElement.matcher(str);
        int cursor = 0;
        while (styles.find()) {
            kept.append(str, cursor, styles.start());
            final boolean isOfficeStyle = msoLine.matcher(styles.group(1)).find();
            if (!isOfficeStyle) {
                kept.append(styles.group());
            }
            cursor = styles.end();
        }
        kept.append(str, cursor, str.length());
        return kept.toString();
    }

    /**
     * Text-level cleanup applied to the writer's output after parsing: first the literal
     * post-parse replacements, then each post-parse regex replacement.
     *
     * <p>A regex is applied for at most ten {@code replaceAll} passes, and only while the
     * lower-cased text still contains the triple's marker and the pattern still matches.
     * Previously, once the ten passes were used up and both conditions still held (for example
     * empty paragraphs nested more than ten deep), the loop never terminated; it now stops after
     * the last pass.
     *
     * @param str the parser output
     * @return the final cleaned text
     */
    private String doPostParsingCleanup(String str) {
        // Upper bound on replaceAll passes per pattern, so self-feeding replacements cannot spin.
        final int maxPasses = 10;

        String text = str;
        for (Pair<String, String> pair : this.postParseStraightReplacements) {
            text = text.replace((CharSequence) pair.getLeft(), (CharSequence) pair.getRight());
        }

        for (Triple<Pattern, String, String> triple : this.postParseRegexReplacements) {
            final Pattern pattern = (Pattern) triple.getLeft();
            final String marker = (String) triple.getMiddle();
            final String replacement = (String) triple.getRight();
            for (int pass = 0; pass < maxPasses; pass++) {
                // Locale.ROOT keeps the marker check independent of the JVM default locale.
                if (!text.toLowerCase(java.util.Locale.ROOT).contains(marker)) {
                    break;
                }
                final Matcher matcher = pattern.matcher(text);
                if (!matcher.find()) {
                    break;
                }
                // replaceAll resets the matcher itself, so the find() above does not skip a match.
                text = matcher.replaceAll(replacement);
            }
        }
        return text;
    }

    /** Matches a whole script element, case-insensitively and across line breaks. Compiled once. */
    private static final Pattern SCRIPT_ELEMENT_PATTERN =
            Pattern.compile("<script[^>]*>(.*?)</script>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Escapes lone {@code <} characters everywhere except inside script elements.
     *
     * <p>The text is split around each {@code <script ...>...</script>} match; the script
     * elements are copied verbatim and every stretch between them is passed through
     * {@link #doEscaping(String)}.
     *
     * @param str the raw HTML
     * @return the HTML with lone {@code <} characters outside scripts escaped
     */
    String encodeLoneTags(String str) {
        final Matcher scripts = SCRIPT_ELEMENT_PATTERN.matcher(str);
        final StringBuilder out = new StringBuilder(str.length());
        int cursor = 0;
        while (scripts.find()) {
            // Escape the markup preceding this script, then copy the script untouched.
            out.append(doEscaping(str.substring(cursor, scripts.start())));
            out.append(str, scripts.start(), scripts.end());
            cursor = scripts.end();
        }
        out.append(doEscaping(str.substring(cursor)));
        return out.toString();
    }

    /**
     * Matches a {@code <} followed by a character that cannot start a tag, closing tag, comment,
     * doctype or processing instruction. Compiled once instead of on every call.
     */
    private static final Pattern LONE_LESS_THAN_PATTERN = Pattern.compile("<([^a-zA-Z?!/])");

    /**
     * Replaces every {@code <} that is followed by a character other than a letter, {@code ?},
     * {@code !} or {@code /} with {@code &lt;}, keeping the following character. A {@code <} at
     * the very end of the text has no following character and is left as is.
     *
     * @param str the text to escape
     * @return the escaped text
     */
    String doEscaping(String str) {
        return LONE_LESS_THAN_PATTERN.matcher(str).replaceAll("&lt;$1");
    }

    /**
     * @return the flag that {@code clean} forwards to the filter's
     *         {@code setAllowJavascriptHandlers}
     */
    public boolean isAllowJavascriptHandlers() {
        return allowJavascriptHandlers;
    }

    /**
     * Sets the flag that {@code clean} forwards to the filter's {@code setAllowJavascriptHandlers}.
     *
     * @param z the new value of the flag
     */
    public void setAllowJavascriptHandlers(boolean z) {
        allowJavascriptHandlers = z;
    }

    /**
     * Returns the blockquote flag that {@code clean} forwards to the filter's
     * {@code setAllowBlockquoteWithNoAttributes}.
     *
     * <p>This used to return the javascript-handlers flag (a copy-paste slip), which made
     * {@link #setAllowBlockquoteWithNoAttributes(boolean)} ineffective. It now returns the
     * blockquote field. The field is not assigned in the constructor, so the value is
     * {@code false} until the setter is called.
     *
     * @return the value last passed to {@link #setAllowBlockquoteWithNoAttributes(boolean)}
     */
    public boolean isAllowBlockquoteWithNoAttributes() {
        return this.allowBlockquoteWithNoAttributes;
    }

    /**
     * Stores the blockquote-without-attributes flag.
     *
     * @param z the new value of the flag
     */
    public void setAllowBlockquoteWithNoAttributes(boolean z) {
        allowBlockquoteWithNoAttributes = z;
    }

    /**
     * Replaces the provider that {@code clean} asks for a new {@link TagAndAttributeFilter} on
     * each call.
     *
     * @param objectProvider the provider to use from now on
     */
    public void setFilterProvider(ObjectProvider<TagAndAttributeFilter> objectProvider) {
        filterProvider = objectProvider;
    }

    /**
     * Replaces the TagSoup schema that {@code clean} sets on the parser.
     *
     * @param schema the schema to use from now on
     */
    public void setSchema(final Schema schema) {
        this.schema = schema;
    }
}
