본문 바로가기
IT/java

함수형 프로그래밍 with 자바 - 단어 카운트

by 가능성1g 2024. 6. 18.
반응형

 

package kr.samdogs.study.func.pojo;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Word-count example written in a functional style with the Stream API.
 *
 * <p>{@link #main(String[])} reads {@code LineCount.java} from the working directory,
 * counts how often each word occurs and prints the resulting map to standard output.
 */
public class LineCount {

	/** Matches a single punctuation character. */
	private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");

	/** Matches a run of whitespace; used as the token delimiter. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	/** Matches a token that consists solely of word characters ({@code \w+}). */
	private static final Pattern WORDS = Pattern.compile("\\w+");

	/**
	 * Counts the words of {@code LineCount.java} and prints the word-to-count map.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		Path location = Paths.get("LineCount.java");

		// The stream returned by Files.lines holds the file open, so close it via try-with-resources.
		try (Stream<String> stream = Files.lines(location)) {
			System.out.println(countWords(stream));
		} catch (IOException | UncheckedIOException e) {
			// Failures while opening arrive as IOException; failures while the stream is being
			// traversed arrive wrapped in UncheckedIOException. Print the exception itself so the
			// exception type is visible, not just its (often terse) message.
			System.err.println("Failed to read " + location + ": " + e);
		}
	}

	/**
	 * Builds a word-frequency map from a stream of text lines.
	 *
	 * <p>Punctuation is deleted (not replaced by a space), so {@code java.io.IOException}
	 * becomes the single token {@code javaioIOException}. Tokens are then split on whitespace,
	 * tokens that are not entirely {@code \w} characters (including empty tokens) are dropped,
	 * and the rest are lower-cased before counting.
	 *
	 * @param lines the lines to analyse; consumed by this call
	 * @return a map from lower-cased word to its number of occurrences
	 */
	static Map<String, Integer> countWords(Stream<String> lines) {
		return lines
				.map(PUNCTUATION::matcher)
				.map(matcher -> matcher.replaceAll(""))          // delete punctuation
				.map(WHITESPACE::split)
				.flatMap(Arrays::stream)                         // Stream<String[]> -> Stream<String>
				.filter(word -> WORDS.matcher(word).matches())   // keep only \w+ tokens
				.map(word -> word.toLowerCase(Locale.ROOT))      // normalise independently of the default locale
				.collect(Collectors.toMap(Function.identity(),   // word -> occurrences
						word -> 1,
						Integer::sum));
	}

}

 

단계별로 보기 위해서는 파이프라인 중간중간에서 collect 한 결과를 찍어 보면 된다! (스트림은 한 번만 소비할 수 있으므로, 아래 코드들은 한 번에 하나씩만 실행해야 한다.)

 

1. 최초에 Files.lines 로 파일을 stream 으로 바꾸면 라인 단위로 들어오게 된다.

System.out.println(
    stream.collect(Collectors.toList()).toString()
);
//출력
[package kr.samdogs.study.func.pojo;, , import java.io.IOException;, import java.nio.file.Files;, import java.nio.file.Path;, import java.nio.file.Paths;, import java.util.Arrays;, import java.util.Map;, import java.util.function.Function;, import java.util.regex.Pattern;, import java.util.stream.Collectors;, import java.util.stream.Stream;, , public class LineCount {, , 	public static void main(String[] args) {, 		/* 단어카운트 함수형 자바 */, 		, 		Path location = Paths.get("LineCount.java");, 		, 		//구두점 문구 패턴, 		Pattern punctuation = Pattern.compile("\\p{Punct}");, 		Pattern whitespace  = Pattern.compile("\\s+");, 		Pattern words       = Pattern.compile("\\w+");, 		, 		try(Stream<String> stream = Files.lines(location)){, 			, 			Map<String, Integer> wordCount = , 					stream.map(punctuation::matcher), 					.map(matcher -> matcher.replaceAll(""))  //구두점 삭제, 					.map(whitespace::split), 					.flatMap(Arrays::stream)  // 공백 기준 단어 분할, 					.filter(word -> words.matcher(word).matches()) //같은단어끼리 모으기??, 					.map(String::toLowerCase) //표준화(모두소문자), 					.collect(Collectors.toMap(Function.identity(),  //단어세기, 							word -> 1,, 							Integer::sum));, 			System.out.println(wordCount);, 		}catch(IOException e) {, 			System.err.print(e.getMessage());			, 		}, , 	}, , }]

 

2. 각 라인에서 구두점(, ; . 등)을 모두 삭제한다. 공백으로 바꾸는 것이 아니라 그냥 지우기 때문에 java.io.IOException 은 javaioIOException 처럼 한 단어로 붙는다.

System.out.println(
		stream.map(punctuation::matcher)
		.map(matcher -> matcher.replaceAll(""))
		.collect(Collectors.toList()).toString()
);
//출력
[package krsamdogsstudyfuncpojo, , import javaioIOException, import javaniofileFiles, import javaniofilePath, import javaniofilePaths, import javautilArrays, import javautilMap, import javautilfunctionFunction, import javautilregexPattern, import javautilstreamCollectors, import javautilstreamStream, , public class LineCount , , 	public static void mainString args , 		 단어카운트 함수형 자바 , 		, 		Path location  PathsgetLineCountjava, 		, 		구두점 문구 패턴, 		Pattern punctuation  PatterncompilepPunct, 		Pattern whitespace   Patterncompiles, 		Pattern words        Patterncompilew, 		, 		tryStreamString stream  Fileslineslocation, 			, 			MapString Integer wordCount  , 					streammappunctuationmatcher, 					mapmatcher  matcherreplaceAll  구두점 삭제, 					mapwhitespacesplit, 					flatMapArraysstream   공백 기준 단어 분할, 					filterword  wordsmatcherwordmatches 같은단어끼리 모으기, 					mapStringtoLowerCase 표준화모두소문자, 					collectCollectorstoMapFunctionidentity  단어세기, 							word  1, 							Integersum, 			SystemoutprintlnwordCount, 		catchIOException e , 			SystemerrprintegetMessage			, 		, , 	, , ]

 

3. 그리고 공백 기준으로 스플릿! 하지만 이렇게만 하면 스트림의 각 요소가 단어가 아니라 String[] 배열이 되어서(출력하면 배열 참조값만 보인다), flatMap 을 이용해서 2depth 구조(Stream<String[]>)를 1depth(Stream<String>)로 바꿔준다.

			System.out.println(
					stream.map(punctuation::matcher)
					.map(matcher -> matcher.replaceAll(""))
					.map(whitespace::split)
					.collect(Collectors.toList()).toString()
			);
//출력-스플릿은 됐지만 단어가 분리되어 배열참조로 되었다.
[[Ljava.lang.String;@568db2f2, [Ljava.lang.String;@378bf509, [Ljava.lang.String;@5fd0d5ae, [Ljava.lang.String;@2d98a335, [Ljava.lang.String;@16b98e56, [Ljava.lang.String;@7ef20235, [Ljava.lang.String;@27d6c5e0, [Ljava.lang.String;@4f3f5b24, [Ljava.lang.String;@15aeb7ab, [Ljava.lang.String;@7b23ec81, [Ljava.lang.String;@6acbcfc0, [Ljava.lang.String;@5f184fc6, [Ljava.lang.String;@3feba861, [Ljava.lang.String;@5b480cf9, [Ljava.lang.String;@6f496d9f, [Ljava.lang.String;@723279cf, [Ljava.lang.String;@10f87f48, [Ljava.lang.String;@b4c966a, [Ljava.lang.String;@2f4d3709, [Ljava.lang.String;@4e50df2e, [Ljava.lang.String;@1d81eb93, [Ljava.lang.String;@7291c18f, [Ljava.lang.String;@34a245ab, [Ljava.lang.String;@7cc355be, [Ljava.lang.String;@6e8cf4c6, [Ljava.lang.String;@12edcd21, [Ljava.lang.String;@34c45dca, [Ljava.lang.String;@52cc8049, [Ljava.lang.String;@5b6f7412, [Ljava.lang.String;@27973e9b, [Ljava.lang.String;@312b1dae, [Ljava.lang.String;@7530d0a, [Ljava.lang.String;@27bc2616, [Ljava.lang.String;@3941a79c, [Ljava.lang.String;@506e1b77, [Ljava.lang.String;@4fca772d, [Ljava.lang.String;@9807454, [Ljava.lang.String;@3d494fbf, [Ljava.lang.String;@1ddc4ec2, [Ljava.lang.String;@133314b, [Ljava.lang.String;@b1bc7ed, [Ljava.lang.String;@7cd84586, [Ljava.lang.String;@30dae81, [Ljava.lang.String;@1b2c6ec2, [Ljava.lang.String;@4edde6e5]

			System.out.println(
					stream.map(punctuation::matcher)
					.map(matcher -> matcher.replaceAll(""))
					.map(whitespace::split)
					.flatMap(Arrays::stream)  // 2depth -> 1depth
					.collect(Collectors.toList()).toString()
			);
//출력-flatMap 을 통해 평탄화
[package, krsamdogsstudyfuncpojo, , import, javaioIOException, import, javaniofileFiles, import, javaniofilePath, import, javaniofilePaths, import, javautilArrays, import, javautilMap, import, javautilfunctionFunction, import, javautilregexPattern, import, javautilstreamCollectors, import, javautilstreamStream, , public, class, LineCount, , , public, static, void, mainString, args, , 단어카운트, 함수형, 자바, , Path, location, PathsgetLineCountjava, , 구두점, 문구, 패턴, , Pattern, punctuation, PatterncompilepPunct, , Pattern, whitespace, Patterncompiles, , Pattern, words, Patterncompilew, , tryStreamString, stream, Fileslineslocation, , MapString, Integer, wordCount, , streammappunctuationmatcher, , mapmatcher, matcherreplaceAll, 구두점, 삭제, , mapwhitespacesplit, , flatMapArraysstream, 공백, 기준, 단어, 분할, , filterword, wordsmatcherwordmatches, 같은단어끼리, 모으기, , mapStringtoLowerCase, 표준화모두소문자, , collectCollectorstoMapFunctionidentity, 단어세기, , word, 1, , Integersum, , SystemoutprintlnwordCount, , catchIOException, e, , SystemerrprintegetMessage, , , ]

 

4. \w+ 패턴에 맞는 토큰(영문자·숫자·밑줄로만 이루어진 것)만 남기고, 표준화(소문자화)를 한 후, 단어를 key 로, 등장 횟수를 value 로 모아서 단어카운트 맵을 완성한다.

			System.out.println(
					stream.map(punctuation::matcher)
					.map(matcher -> matcher.replaceAll(""))
					.map(whitespace::split)
					.flatMap(Arrays::stream)
					.filter(word -> words.matcher(word).matches()) //같은단어끼리 모으기??
					.map(String::toLowerCase) //표준화(모두소문자)					
					.collect(Collectors.toMap(Function.identity(),
							word -> 1,
							Integer::sum))
			);
//출력
{wordcount=1, mapmatcher=1, integer=1, javaniofilepath=1, wordsmatcherwordmatches=1, path=1, catchioexception=1, integersum=1, javaioioexception=1, trystreamstring=1, void=1, static=1, package=1, javautilstreamcollectors=1, patterncompiles=1, javautilmap=1, 1=1, linecount=1, patterncompilew=1, javautilfunctionfunction=1, patterncompileppunct=1, word=1, krsamdogsstudyfuncpojo=1, streammappunctuationmatcher=1, import=10, javaniofilefiles=1, javautilregexpattern=1, pattern=3, javautilarrays=1, public=2, stream=1, javaniofilepaths=1, pathsgetlinecountjava=1, systemoutprintlnwordcount=1, flatmaparraysstream=1, mainstring=1, class=1, e=1, mapwhitespacesplit=1, words=1, mapstring=1, fileslineslocation=1, matcherreplaceall=1, args=1, systemerrprintegetmessage=1, javautilstreamstream=1, collectcollectorstomapfunctionidentity=1, punctuation=1, mapstringtolowercase=1, location=1, filterword=1, whitespace=1}

 

이렇게 사고하는게 쉽진 않은데, 이해만 한다면, 또 깔끔한 코드라고 생각된다~~!

노력노력!

반응형