byte 배열에서 charset 정보 detecting 하기

2023년 08월 04일

알쓸개잡
작성자
2023.08.04.:55

간혹 바이트 배열로 전달된 문자열의 인코딩 정보를 알 수 없어, String 타입으로 디코딩할 때 문자열이 깨지는 경우가 있다.

이 경우 charset detector 를 사용하면 해당 바이트 배열로부터 인코딩 정보를 추정할 수 있다. 추정된 인코딩 정보를 기반으로 디코딩을 하면 되기 때문에 문자열이 깨지는 경우를 어느 정도 줄일 수 있다.

juniversal charset detector 와 icu4j 사용법에 대한 샘플 코드는 아래와 같다.

Juniversal charset detector

dependency

<dependency>
    <groupId>com.googlecode.juniversalchardet</groupId>
    <artifactId>juniversalchardet</artifactId>
    <version>1.0.3</version>
</dependency>

package com.example.charset.detect;

import org.mozilla.universalchardet.UniversalDetector;

/**
 * Thin wrapper around juniversalchardet's {@code UniversalDetector} that guesses the
 * charset of a byte array.
 */
public class JuniversalChardetExample {

	/**
	 * Feeds the whole array to a fresh detector and returns the charset name it reports.
	 *
	 * @param source encoded bytes whose charset is unknown
	 * @return the value of {@code getDetectedCharset()} once all data has been fed
	 *         (NOTE(review): the library documents this as null when nothing could be
	 *         detected - confirm, and handle that case in callers)
	 */
	public String charsetDetect(byte[] source) {
		// The constructor accepts a listener (written here as a lambda) that receives the
		// detected charset name. Passing null instead - new UniversalDetector(null) - is
		// also possible.
		UniversalDetector charsetProbe = new UniversalDetector(
			name -> System.out.println("detected charset = " + name));

		final int length = source.length;
		charsetProbe.handleData(source, 0, length);
		charsetProbe.dataEnd();

		String detected = charsetProbe.getDetectedCharset();
		return detected;
	}
}

Icu4j

dependency

<dependency>
    <groupId>com.ibm.icu</groupId>
    <artifactId>icu4j</artifactId>
    <version>73.2</version>
</dependency>

package com.example.charset.detect;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Guesses the charset of a byte array with ICU4J's {@code CharsetDetector}.
 */
public class Icu4jExample {

	/**
	 * Returns the name of the best-matching charset for the given bytes.
	 *
	 * @param source encoded bytes whose charset is unknown
	 * @return the name of the best match, or {@code null} when the detector finds no match
	 */
	public String charsetDetect(byte[] source) {
		CharsetDetector detector = new CharsetDetector();
		detector.setText(source);

		// detect() yields null when no charset matches the input; report that as null
		// rather than failing with a NullPointerException on match.getName().
		CharsetMatch match = detector.detect();
		return match == null ? null : match.getName();
	}
}

테스트코드

package com.example.charset.detect;

import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

import java.nio.charset.Charset;

/**
 * Demo application: encodes a few sample strings with different charsets, runs both
 * detectors over the resulting bytes, and prints the detected charset together with the
 * text decoded using that charset.
 */
@SpringBootApplication
public class CharsetdetectApplication implements CommandLineRunner {

	public static void main(String[] args) {
		SpringApplication.run(CharsetdetectApplication.class, args);
	}

	/**
	 * Runs every sample through both detectors and prints one report section per
	 * (sample, detector) pair.
	 *
	 * @param args command-line arguments (unused)
	 * @throws Exception if a requested or detected charset name is not supported by the JVM
	 */
	@Override
	public void run(String... args) throws Exception {
		System.out.println("default charset: " + Charset.defaultCharset());

		String string1 = "이것은 한글입니다. 인코딩은 UTF-8 입니다.";
		String string2 = "이것은 한글입니다. 인코딩은 EUC-KR 입니다.";
		// Japanese sample
		String shiftJis = "Google の使命は、Google 独自の検索エンジンにより、世界中の情報を体系化し、アクセス可能で有益なものにすることです。";

		JuniversalChardetExample juniversalChardetExample = new JuniversalChardetExample();
		Icu4jExample icu4jExample = new Icu4jExample();

		// Each sample is encoded once and the same bytes are used for detection and for
		// decoding. Detection is evaluated as a method argument, so it still runs before
		// the section is printed.

		// Encoded with the default charset (file.encoding system property)
		byte[] string1Bytes = string1.getBytes();
		printSection("juniversal", "string1", string1Bytes,
			juniversalChardetExample.charsetDetect(string1Bytes));
		printSection("icu4j", "string1", string1Bytes,
			icu4jExample.charsetDetect(string1Bytes));

		// Encoded as EUC-KR
		byte[] string2Bytes = string2.getBytes("EUC-KR");
		printSection("juniversal", "string2", string2Bytes,
			juniversalChardetExample.charsetDetect(string2Bytes));
		printSection("icu4j", "string2", string2Bytes,
			icu4jExample.charsetDetect(string2Bytes));

		// Japanese sample encoded with the default charset
		byte[] shiftJisDefaultBytes = shiftJis.getBytes();
		printSection("juniversal", "shiftjis", shiftJisDefaultBytes,
			juniversalChardetExample.charsetDetect(shiftJisDefaultBytes));
		printSection("icu4j", "shiftjis", shiftJisDefaultBytes,
			icu4jExample.charsetDetect(shiftJisDefaultBytes));

		// Japanese sample encoded as SHIFT-JIS
		byte[] shiftJisBytes = shiftJis.getBytes("SHIFT-JIS");
		printSection("juniversal", "shiftjis", shiftJisBytes,
			juniversalChardetExample.charsetDetect(shiftJisBytes));
		printSection("icu4j", "shiftjis", shiftJisBytes,
			icu4jExample.charsetDetect(shiftJisBytes));
	}

	/**
	 * Prints one report section: banner, detected charset, decoded text, banner, blank line.
	 *
	 * @param detectorName    name shown in the banner ("juniversal" or "icu4j")
	 * @param label           prefix identifying the sample in the output lines
	 * @param encoded         the bytes that were handed to the detector
	 * @param detectedCharset charset name reported by the detector; may be null
	 * @throws java.io.UnsupportedEncodingException if the detected name is unknown to the JVM
	 */
	private static void printSection(String detectorName, String label, byte[] encoded,
			String detectedCharset) throws java.io.UnsupportedEncodingException {
		String banner = "==== " + detectorName + " charset ====";
		System.out.println(banner);
		System.out.println(label + " charset: " + detectedCharset);
		if (detectedCharset == null) {
			// new String(bytes, (String) null) would throw a NullPointerException, so
			// report the missing result instead of decoding.
			System.out.println(label + " decoded: (charset not detected)");
		} else {
			System.out.println(label + " decoded: " + new String(encoded, detectedCharset));
		}
		System.out.println(banner);
		System.out.println();
	}
}

URL 인코딩된 데이터 디코딩 하기

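URL 인코딩된 데이터의 문자셋 정보가 없는 경우에는 제대로 디코딩을 할 수 없다. 이 경우 먼저 ISO-8859-1 로 URL 디코딩하여 원본 바이트를 그대로 복원한 뒤 (ISO-8859-1 은 0x00~0xFF 바이트를 문자와 1:1 로 매핑하므로 바이트가 손실되지 않는다), charset detector 로 문자셋을 추정하면 도움이 된다.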

// URL-encoded data whose underlying bytes are Shift_JIS. When that charset is not known
// in advance, it can be handled as follows.
String urlEncoded = "Google%20%82%CC%8Eg%96%BD%82%CD%81AGoogle%20%93%C6%8E%A9%82%CC" +
    "%8C%9F%8D%F5%83G%83%93%83W%83%93%82%C9%82%E6%82%E8%81A%90%A2%8AE%92%86%82%CC%8F%EE" +
    "%95%F1%82%F0%91%CC%8Cn%89%BB%82%B5%81A%83A%83N%83Z%83X%89%C2%94%5C%82%C5%97L%89v%82" +
    "%C8%82%E0%82%CC%82%C9%82%B7%82%E9%82%B1%82%C6%82%C5%82%B7%81B";

// URL-decode using ISO_8859_1, then turn the result back into bytes with the same
// charset. ISO_8859_1 maps every byte value to exactly one char, so this recovers the
// original raw bytes. They are recovered once and reused for detection and decoding.
String urlDecoded = URLDecoder.decode(urlEncoded, StandardCharsets.ISO_8859_1);
byte[] rawBytes = urlDecoded.getBytes(StandardCharsets.ISO_8859_1);
String urlEncodedCharset = juniversalChardetExample.charsetDetect(rawBytes);
System.out.println("==== juniversal charset ====");
System.out.println("urlEncoded charset: " + urlEncodedCharset);
// Passing a null charset name to new String(...) throws a NullPointerException, so only
// decode when the detector actually reported a charset.
String urlEncodedText = urlEncodedCharset == null
    ? "(charset not detected)"
    : new String(rawBytes, urlEncodedCharset);
System.out.println("urlEncoded decoded: " + urlEncodedText);
System.out.println("==== juniversal charset ====");
System.out.println();

저작자표시 비영리 변경금지 (새창열림)

'자바' 카테고리의 다른 글

jdk pattern matching for instanceof (0)	2023.08.26
java switch expression - from jdk 14 (0)	2023.08.20
java Array vs ArrayList (0)	2023.08.19
java record 용법 - from jdk 14 (0)	2023.08.19
CompletableFuture 를 알아보자 (0)	2023.08.06

다음 글이 없습니다.

이전 글이 없습니다.

Juniversal charset detector

Icu4j

테스트코드

'자바' 카테고리의 다른 글

티스토리툴바