package org.apache.tika.parser.csv;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import org.apache.commons.io.ByteOrderMark;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

/* loaded from: input_file:org/apache/tika/parser/csv/TextAndCSVParserTest.class */
/**
 * Exercises {@link TextAndCSVParser} through an {@link AutoDetectParser}.
 *
 * <p>Most tests feed small in-memory comma- or tab-delimited fixtures to a parser built from
 * {@code org/apache/tika/parser/csv/tika-config.xml} and assert on the reported delimiter, the
 * reported {@code Content-Type} (including its {@code charset} and {@code delimiter}
 * parameters) and the generated XHTML table.
 */
public class TextAndCSVParserTest extends TikaTest {

    /** Class-path location of the Tika configuration used to build {@link #PARSER}. */
    private static final String PARSER_CONFIG = "org/apache/tika/parser/csv/tika-config.xml";

    /** Resource name attached to the metadata of most in-memory fixtures. */
    private static final String CSV_RESOURCE_NAME = "test.csv";

    /** Three rows of three comma-separated cells; some cells contain tabs. */
    private static final String CSV_TEXT =
            "the,quick,brown\tfox\n"
                    + "jumped \tover,the\tlazy,\tdog\n"
                    + "and then,ran,down\tthe\tstreet";

    /** Same layout as {@link #CSV_TEXT} with the roles of comma and tab swapped. */
    private static final String TSV_TEXT =
            "the\tquick\tbrown,fox\n"
                    + "jumped ,over\tthe,lazy\t,dog\n"
                    + "and then\tran\tdown,the,street";

    private static final byte[] CSV_UTF8 = CSV_TEXT.getBytes(StandardCharsets.UTF_8);
    private static final byte[] CSV_UTF_16LE = CSV_TEXT.getBytes(StandardCharsets.UTF_16LE);
    private static final byte[] TSV_UTF8 = TSV_TEXT.getBytes(StandardCharsets.UTF_8);
    private static final byte[] TSV_UTF_16LE = TSV_TEXT.getBytes(StandardCharsets.UTF_16LE);

    /** Expected table markup for the TSV fixture, with whitespace runs collapsed to one space. */
    private static final String EXPECTED_TSV =
            ("<table><tr> <td>the</td> <td>quick</td> <td>brown,fox</td></tr>\n"
                    + "<tr> <td>jumped ,over</td> <td>the,lazy</td> <td>,dog</td></tr>\n"
                    + "<tr> <td>and then</td> <td>ran</td> <td>down,the,street</td></tr>\n"
                    + "</table>").replaceAll("[\r\n\t ]+", " ");

    /**
     * Expected table markup for the CSV fixture: the TSV expectation with every run of commas
     * turned into a single space (the in-cell tabs of the CSV fixture are compared as spaces by
     * {@link #assertContainsIgnoreWhiteSpaceDiffs(String, String)}).
     */
    private static final String EXPECTED_CSV = EXPECTED_TSV.replaceAll(",+", " ");

    /** Parser under test; assigned once in {@link #setUp()}, hence not {@code final}. */
    private static Parser PARSER;

    /**
     * Builds the shared {@link AutoDetectParser} from the test Tika configuration.
     *
     * @throws Exception if the configuration cannot be read or parsed
     */
    @BeforeAll
    public static void setUp() throws Exception {
        try (InputStream config = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream(PARSER_CONFIG)) {
            // Fail with a readable message instead of handing a null stream to TikaConfig.
            Assertions.assertNotNull(config, "missing test resource: " + PARSER_CONFIG);
            PARSER = new AutoDetectParser(new TikaConfig(config));
        }
    }

    /**
     * Asserts that a reported media type string equals {@code text/<subtype>} with the given
     * {@code charset} and {@code delimiter} parameters.
     *
     * @param subtype         expected subtype of the {@code text} media type
     * @param charset         expected value of the {@code charset} parameter
     * @param delimiter       expected value of the {@code delimiter} parameter
     * @param mediaTypeString media type string reported by the parser; must not be null
     */
    private static void assertMediaTypeEquals(String subtype, String charset, String delimiter,
                                              String mediaTypeString) {
        if (mediaTypeString == null) {
            Assertions.fail("media type string must not be null");
        }
        MediaType expected = mediaType(subtype, charset, delimiter);
        MediaType observed = MediaType.parse(mediaTypeString);
        Assertions.assertEquals(expected, observed);
    }

    /**
     * Builds {@code text/<subtype>} carrying {@code charset} and {@code delimiter} parameters.
     *
     * @param subtype   subtype of the {@code text} media type
     * @param charset   value for the {@code charset} parameter
     * @param delimiter value for the {@code delimiter} parameter
     * @return the parameterized media type
     */
    private static MediaType mediaType(String subtype, String charset, String delimiter) {
        HashMap<String, String> parameters = new HashMap<>();
        parameters.put("charset", charset);
        parameters.put("delimiter", delimiter);
        return new MediaType(MediaType.text(subtype), parameters);
    }

    /**
     * Returns a new array holding the bytes of {@code head} followed by the bytes of
     * {@code tail}.
     */
    private static byte[] concat(byte[] head, byte[] tail) {
        byte[] joined = new byte[head.length + tail.length];
        System.arraycopy(head, 0, joined, 0, head.length);
        System.arraycopy(tail, 0, joined, head.length, tail.length);
        return joined;
    }

    /** Creates metadata whose resource-name entry is set to {@code resourceName}. */
    private static Metadata metadataWithResourceName(String resourceName) {
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, resourceName);
        return metadata;
    }

    /** Parses {@code content} with the shared {@link #PARSER}, passing along {@code metadata}. */
    private XMLResult parseWithConfiguredParser(byte[] content, Metadata metadata)
            throws Exception {
        return getXML(new ByteArrayInputStream(content), PARSER, metadata);
    }

    /** UTF-8 CSV identified by a {@code .csv} resource name; also checks row/column counts. */
    @Test
    public void testCSV_UTF8() throws Exception {
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        XMLResult xml = parseWithConfiguredParser(CSV_UTF8, metadata);
        Assertions.assertEquals("comma", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("csv", "ISO-8859-1", "comma", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xml.xml);
        Assertions.assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_COLUMNS));
        Assertions.assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_ROWS));
    }

    /** A user content-type override carrying {@code charset=UTF-8} is reflected in the result. */
    @Test
    public void testCSV_UTF8_TypeOverride() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE, "text/csv; charset=UTF-8");
        XMLResult xml = parseWithConfiguredParser(CSV_UTF8, metadata);
        Assertions.assertEquals("comma", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("csv", "UTF-8", "comma", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xml.xml);
    }

    /** UTF-8 CSV identified only by a {@code text/csv} content-type hint in the metadata. */
    @Test
    public void testCSV_UTF8_Type() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/csv");
        XMLResult xml = parseWithConfiguredParser(CSV_UTF8, metadata);
        Assertions.assertEquals("comma", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("csv", "ISO-8859-1", "comma", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xml.xml);
    }

    /** UTF-16LE CSV without a byte order mark. */
    @Test
    public void testCSV_UTF16LE() throws Exception {
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        XMLResult xml = parseWithConfiguredParser(CSV_UTF_16LE, metadata);
        Assertions.assertEquals("comma", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("csv", "UTF-16LE", "comma", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xml.xml);
    }

    /** UTF-16LE CSV preceded by the UTF-16LE byte order mark. */
    @Test
    public void testCSV_UTF16LE_BOM() throws Exception {
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        byte[] withBom = concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE);
        XMLResult xml = parseWithConfiguredParser(withBom, metadata);
        Assertions.assertEquals("comma", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("csv", "UTF-16LE", "comma", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xml.xml);
    }

    /** UTF-8 tab-delimited content is reported as TSV even though the resource is named .csv. */
    @Test
    public void testTSV_UTF8() throws Exception {
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        XMLResult xml = parseWithConfiguredParser(TSV_UTF8, metadata);
        Assertions.assertEquals("tab", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("tsv", "ISO-8859-1", "tab", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xml.xml);
    }

    /** UTF-16LE tab-delimited content is reported as TSV even though the resource is named .csv. */
    @Test
    public void testTSV_UTF16LE() throws Exception {
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        XMLResult xml = parseWithConfiguredParser(TSV_UTF_16LE, metadata);
        Assertions.assertEquals("tab", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertMediaTypeEquals("tsv", "UTF-16LE", "tab", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xml.xml);
    }

    /**
     * Content with unbalanced quotes is expected to come back as plain text with no delimiter
     * reported, while the raw text is still present in the output.
     */
    @Test
    public void testBadCsv() throws Exception {
        byte[] malformed = "the,quick\nbrown,\"la\"zy\"\nbrown,\"dog\n".getBytes(StandardCharsets.UTF_8);
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        XMLResult xml = parseWithConfiguredParser(malformed, metadata);
        Assertions.assertNull(xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        Assertions.assertEquals("text/plain; charset=ISO-8859-1", xml.metadata.get(Metadata.CONTENT_TYPE));
        assertContains("the,quick", xml.xml);
    }

    /** Non-tabular text is expected to be reported as text/plain under both resource names. */
    @Test
    public void testNonCSV() throws Exception {
        byte[] plainText = "testcsv\ntestcsv testcsv;;; testcsv".getBytes(StandardCharsets.UTF_8);
        Metadata metadata = metadataWithResourceName(CSV_RESOURCE_NAME);
        XMLResult asCsvName = parseWithConfiguredParser(plainText, metadata);
        assertContains("text/plain", asCsvName.metadata.get(Metadata.CONTENT_TYPE));

        // NOTE(review): the same Metadata instance is reused for the second parse, exactly as
        // before; only the resource name is changed.
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
        XMLResult asTxtName = parseWithConfiguredParser(plainText, metadata);
        assertContains("text/plain", asTxtName.metadata.get(Metadata.CONTENT_TYPE));
    }

    /** A 1000-row, comma-terminated numeric table parsed without any metadata hints. */
    @Test
    public void testLong() throws Exception {
        final int rows = 1000;
        final int cellsPerRow = 10;
        StringBuilder table = new StringBuilder();
        for (int row = 0; row < rows; row++) {
            for (int cell = 0; cell < cellsPerRow; cell++) {
                table.append("2").append(",");
            }
            table.append("\n");
        }
        byte[] content = table.toString().getBytes(StandardCharsets.UTF_8);
        XMLResult xml = parseWithConfiguredParser(content, new Metadata());
        assertMediaTypeEquals("csv", "ISO-8859-1", "comma", xml.metadata.get(Metadata.CONTENT_TYPE));
    }

    /** A vCalendar file keeps its own media type rather than being reported as CSV or text. */
    @Test
    public void testSubclassingMimeTypesRemain() throws Exception {
        XMLResult xml = getXML("testVCalendar.vcs");
        Assertions.assertEquals("text/x-vcalendar; charset=ISO-8859-1", xml.metadata.get(Metadata.CONTENT_TYPE));
    }

    /** A parser built from the colon-delimiter configuration reports the {@code colon} delimiter. */
    @Test
    public void testCustomizingDelimiter() throws Exception {
        final String configPath = "/test-configs/tika-config-colon-delimiter.xml";
        TikaConfig tikaConfig;
        try (InputStream config = TextAndCSVParserTest.class.getResourceAsStream(configPath)) {
            // Fail with a readable message instead of handing a null stream to TikaConfig.
            Assertions.assertNotNull(config, "missing test resource: " + configPath);
            tikaConfig = new TikaConfig(config);
        }
        XMLResult xml = getXML("testColonDelimited.txt", new AutoDetectParser(tikaConfig));
        Assertions.assertEquals("colon", xml.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
        assertContains("colon", xml.metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Asserts that {@code xml} contains {@code expected} after each CR, LF, tab or space in
     * {@code xml} has been replaced by a single space. Characters are replaced one for one;
     * runs of whitespace are not collapsed.
     */
    private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
        assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
    }
}
