<?php
//
// 국립국어원 표준국어대사전 클리핑 프로그램
//
// 이 프로그램은 누구나 수정할 수 있지만 배포는 금지합니다.
// 이 프로그램에 대한 어떠한 질문도 받지 않습니다.
// 따라서 이 프로그램을 실행해서 잘 동작하면 웃고, 그렇지 않으면 이 프로그램을 바로 지우기 바랍니다.
// 
// 판올림도 없고 버그 수정도 없습니다.
// 이 프로그램을 사용함으로 발생하는 모든 책임은 본인 스스로에게 있습니다.
//
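// Set the clipping range with $min and $max.
// Entries are appended to a file named korea<$max>.txt (e.g. korea518063.txt),
// created relative to the current working directory.
// Images are downloaded automatically when an entry has one.
// This includes not only pictures but also images embedded in the body text, such as Hanja, archaic Korean and formulas.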
//
// A limit of 0 disables the execution time limit; the crawl below walks a very large index range.
set_time_limit(0);

// HTML parsing library; str_get_html() used further down is presumably defined there.
include_once 'simple_html_dom.php';

// Inclusive range of dictionary entry indices to fetch.
$min = 1;
$max = 518063;

/**
 * Perform a minimal HTTP/1.0 GET over a raw socket and return the response body.
 *
 * The Host header is derived from the global $nuke_url when the including
 * application defines it; otherwise the host being connected to is used, so
 * the request is valid when this file runs on its own.
 *
 * @param string $host_ip Host name or IP address to connect to.
 * @param int    $port    TCP port.
 * @param string $query   Request target (path plus optional query string).
 * @return string Response body, or '' when the connection fails or the
 *                response contains no header/body separator.
 */
function getQuery($host_ip, $port, $query) {
    global $nuke_url;

    // ereg_replace() no longer exists in PHP 7+; this is the same substitution with PCRE.
    $hostname = '';
    if (is_string($nuke_url) && $nuke_url !== '') {
        $hostname = (string) preg_replace('#^http://([^/]*)[/]*#', '\\1', $nuke_url);
    }
    if ($hostname === '') {
        $hostname = $host_ip;
    }

    // The $_SERVER entries are absent when the script is run from the command line.
    $httpHost   = isset($_SERVER["HTTP_HOST"]) ? $_SERVER["HTTP_HOST"] : '';
    $requestUri = isset($_SERVER["REQUEST_URI"]) ? $_SERVER["REQUEST_URI"] : '';
    $referer    = "http://".$httpHost.$requestUri;

    // Every header line is terminated with CRLF, as HTTP requires.
    $request = "GET $query HTTP/1.0\r\n"
             . "Host: $hostname\r\n"
             . "User-Agent: DoA/1.1\r\n"
             . "Referer: $referer\r\n"
             . "Connection: Close\r\n\r\n";

    $errno  = 0;
    $errstr = '';
    // $errno/$errstr are filled by reference; call-time "&" is not valid in modern PHP.
    $fp = @fsockopen($host_ip, $port, $errno, $errstr, 10);
    if (!$fp) {
        echo "$errstr: $errno <br>\n";
        return '';
    }

    @fwrite($fp, $request);
    // Read to end of stream in one call; a feof()/fgets() loop can spin forever
    // when the socket times out without reaching EOF.
    $response = @stream_get_contents($fp);
    @fclose($fp);

    if (!is_string($response)) {
        return '';
    }

    // Headers and body are separated by the first blank line.
    $parts = explode("\r\n\r\n", $response, 2);
    return isset($parts[1]) ? $parts[1] : '';
}

/**
 * Read a string property from a simple_html_dom node, tolerating a missing node.
 *
 * @param object|null $node Result of a find(..., $index) call; null when nothing matched.
 * @param string      $prop Property to read ('plaintext' or 'innertext').
 * @return string Trimmed property value, or '' when the node is missing.
 */
function getDictNodeProp($node, $prop) {
    if (!is_object($node)) {
        return '';
    }
    return trim((string) $node->$prop);
}

/**
 * Extract the fields of one dictionary entry from the raw HTML of its page.
 *
 * @param string $str_html HTML of the entry page.
 * @return array Keys 'title1', 'title2', 'use' and 'list1'..'list6'. Every
 *               value is '' when the page has no span with id "print_area"
 *               or the HTML cannot be parsed.
 */
function getDict($str_html) {
    $item = array(
        'title1' => '',
        'title2' => '',
        'use'    => '',
        'list1'  => '',
        'list2'  => '',
        'list3'  => '',
        'list4'  => '',
        'list5'  => '',
        'list6'  => '',
    );

    if (!is_string($str_html) || $str_html === '') {
        return $item;
    }

    // Escape angle brackets that appear as quoted literal text so strip_tags()
    // does not treat them as the start or end of a tag.
    // NOTE(review): the replacement was identical to the search string in the
    // source as received; entity-escaping is assumed to be the intent - confirm.
    $str_html = str_replace(array('‘<’', '‘>’'), array('‘&lt;’', '‘&gt;’'), $str_html);
    $str_html = strip_tags($str_html, '<div><ul><li><img><span><td><br>');

    // Site CSS class names mapped to inline styles. strtr() tries the longest
    // key first and never rescans its own output, so 'NumRG2' is no longer
    // swallowed by 'NumRG' (nor 'provtitle' by 'prov', 'idiom_list' by 'idiom').
    $classStyles = array(
        'sword'      => 'font-family:AppleGothic, Sans-serif ; font-size:14px; font-weight:bold; color:#336699',
        'provtitle'  => '',
        'exp'        => 'font-family:AppleGothic, Sans-serif;color:#000000; line-height:1.5; padding:5px;list-style: none;',
        'sdblue'     => 'font-family:AppleGothic, Sans-serif; font-size:13px;color:#336699;',
        'NumRG'      => 'font-family:tahoma; font-weight:bold;color:#549606; padding-top:-10px; margin-bottom:-0.1;vertical-align:top',
        'NumRG2'     => 'font-family:tahoma;  font-weight:bold;color:#336699;padding-left:8px; margin-top:-10;vertical-align:top;',
        'NumNO'      => 'width:45px; font-weight:bold;color:#cb4a00;vertical-align:top; text-align:right',
        'Use_icon'   => 'color:#444444; padding-left:20px;font-size: 13px',
        'idiom_list' => 'list-style: none;padding:5px;',
        'prov'       => 'background:url(icon_prov.gif) no-repeat; background-position:0px 5px; padding:5px 0px 5px 30px;line-height:1.3',
        'idiom'      => 'background:url(icon_idiom.gif) no-repeat;background-position:0px 5px;  padding:5px 0px 5px 30px; line-height:1.3',
    );
    $str_html = strtr($str_html, $classStyles);

    $html = str_get_html($str_html);
    if (!$html) {
        // NOTE(review): str_get_html() is expected to return false for input it
        // refuses to parse; calling find() on that value would be a fatal error.
        return $item;
    }

    $titleNoise      = array("^", "-", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
    $lineBreaks      = array("\r\n", "\n", "\r", "\t");
    $lineBreaksSpace = array("\r\n", "\n", "\r", "\t", " ");

    foreach ($html->find('span[id=print_area]') as $article) {
        // Number of div.list2 blocks found; shifts the index of the generic divs read below.
        $diff = 0;

        // Headword with carets, hyphens and digits removed.
        $item['title1'] = str_replace(
            $titleNoise,
            '',
            getDictNodeProp($article->find('span.word_title', 0), 'plaintext')
        );

        $item['title2'] = str_replace(
            $lineBreaksSpace,
            '',
            getDictNodeProp($article->find('td.pb10', 0), 'plaintext')
        );

        // Keep only the piece after the first ':' (split() is gone in PHP 7+).
        $use = str_replace(
            $lineBreaksSpace,
            '',
            getDictNodeProp($article->find('td[id=use_title]', 0), 'plaintext')
        );
        $useParts = explode(':', $use);
        $item['use'] = isset($useParts[1]) ? $useParts[1] : '';

        // list1 is div.list followed by up to two div.list2 blocks.
        $list1 = getDictNodeProp($article->find('div.list', 0), 'innertext');
        for ($n = 0; $n < 2; $n++) {
            $extra = $article->find('div.list2', $n);
            if ($extra) {
                $list1 .= "<br>".trim((string) $extra->innertext);
                $diff++;
            }
        }
        $item['list1'] = str_replace($lineBreaks, '', $list1);

        // list2..list6 are the divs at index 1..5, shifted by $diff.
        for ($n = 2; $n <= 6; $n++) {
            $item['list'.$n] = str_replace(
                $lineBreaks,
                '',
                getDictNodeProp($article->find('div', $n - 1 + $diff), 'innertext')
            );
        }
    }

    $html->clear();
    unset($html);
    return $item;
}

/**
 * Fetch the HTML of the dictionary entry page with the given index.
 *
 * @param int|string $num Entry index, sent as the "idx" query parameter.
 * @return string Whatever getQuery() returns for the page request.
 */
function getHtml($num) {
    // Encode the index so it cannot break the request line; plain integers are unchanged.
    $query = '/search/View.jsp?idx='.rawurlencode((string) $num);
    return getQuery('stdweb2.korean.go.kr', 80, $query);
}

/**
 * Download an image and store it in the current directory under its base name.
 *
 * @param string $url          Absolute URL, or a path on the dictionary site.
 * @param string $default_host Host used when $url carries none, i.e. when the
 *                             page referenced the image by path only.
 * @return string Local file name (last path segment of the URL), or '' when
 *                the URL has no usable file name.
 */
function saveImg($url, $default_host = 'stdweb2.korean.go.kr') {
    $parts = parse_url((string) $url);
    if (!is_array($parts) || !isset($parts['path']) || $parts['path'] === '') {
        return '';
    }

    // Last path segment becomes the local file name (split() is gone in PHP 7+).
    $segments = explode('/', $parts['path']);
    $filename = array_pop($segments);
    if ($filename === '') {
        return '';
    }

    $host = isset($parts['host']) ? $parts['host'] : $default_host;
    $port = isset($parts['port']) ? $parts['port'] : 80;

    // Request target: the path plus the query string when there is one.
    // A path without a leading slash is sent relative to the server root;
    // resolving it against the referring page is not attempted here.
    $target = $parts['path'];
    if ($target[0] !== '/') {
        $target = '/'.$target;
    }
    if (isset($parts['query'])) {
        $target .= '?'.$parts['query'];
    }

    $img = getQuery($host, $port, $target);

    // Skip the write when nothing was fetched so an earlier good copy is not
    // truncated. file_put_contents() writes binary-safely.
    if (is_string($img) && $img !== '') {
        @file_put_contents($filename, $img);
    }

    return $filename;
}

/**
 * Extract the src and alt attributes of the first <img> in an HTML fragment.
 *
 * @param string $img HTML fragment expected to contain an <img> tag.
 * @return array array($url, $alt); both are '' when the fragment cannot be
 *               parsed or holds no <img>.
 */
function getImgInfo($img) {
    $url = '';
    $alt = '';

    $html = str_get_html($img);
    if (!$html) {
        return array($url, $alt);
    }

    $node = $html->find('img', 0);
    if ($node) {
        $url = (string) $node->src;
        $alt = (string) $node->alt;
    }

    // Release the parsed DOM, as getDict() does.
    $html->clear();
    unset($html);

    return array($url, $alt);
}

/**
 * Append text to a file, creating the file when it does not exist.
 *
 * @param string $filename Path of the file to append to.
 * @param string $cont     Text to append.
 * @return bool True when the content was written, false when the file could
 *              not be opened or written.
 */
function saveDict($filename, $cont) {
    $fp = @fopen($filename, 'a');
    if (!$fp) {
        // Passing a failed handle to fwrite()/fclose() is a TypeError on PHP 8.
        return false;
    }
    $written = fwrite($fp, $cont);
    fclose($fp);
    return $written !== false;
}

/**
 * Render one section of a dictionary entry for the output line.
 *
 * A section without an <img> tag is emitted as-is behind $plainPrefix.
 * Otherwise its first image is downloaded: for a .gif the image URL inside
 * the section is replaced by the quoted local file name and the section is
 * kept; for any other file the whole section is replaced by a centred image
 * followed by its alt text.
 *
 * @param string $section     Inner HTML of the section.
 * @param string $plainPrefix Text placed before a section that has no image.
 * @return string HTML to append to the output line.
 */
function renderDictSection($section, $plainPrefix) {
    if (!preg_match("/<img.*/", $section)) {
        return $plainPrefix.$section;
    }

    list($url, $alt) = getImgInfo($section);
    if (!is_string($url) || $url === '') {
        // No usable image URL: keep the text rather than emit a broken <img>.
        return $plainPrefix.$section;
    }

    $filename = saveImg($url);
    if (preg_match("/\.gif/is", $filename)) {
        return str_replace($url, "\"$filename\"", $section);
    }
    return "<center><img src='".$filename."' width='300'><br>".$alt."</center>";
}

header("Content-Type: text/html; charset=utf-8");

// Number of indices skipped because no headword was found.
$count = 0;

// Sections in output order; only list1 is appended without a leading <br>.
$sectionPrefixes = array(
    'list1' => '',
    'list2' => "<br>",
    'list3' => "<br>",
    'list4' => "<br>",
    'list5' => "<br>",
    'list6' => "<br>",
);

for ($i = $min; $i <= $max; $i++) {
    $str_html = getHtml($i);
    $dict = getDict($str_html);
    if (!is_array($dict)) {
        $dict = array();
    }

    // NOTE(review): getDict() in this file never sets 'img'; the branch is kept for compatibility.
    if (isset($dict['img']) && $dict['img'] != '') {
        $dict['img'] = saveImg($dict['img']);
    }

    if (!isset($dict['title1']) || $dict['title1'] == '') {
        $count++;
        continue;
    }

    $title2 = isset($dict['title2']) ? $dict['title2'] : '';
    $saveDict = $dict['title1']."\t<div style='font-weight: bold; font-size: 20px; color: #0000ff'>".$title2."</div>";
    if (isset($dict['use']) && $dict['use'] != '') {
        $saveDict .= "<br><b>◈ 활용</b>:".$dict['use']."<br>";
    }

    foreach ($sectionPrefixes as $key => $prefix) {
        if (isset($dict[$key]) && $dict[$key] != '') {
            $saveDict .= renderDictSection($dict[$key], $prefix);
        }
    }

    // Turn the remaining class/id attributes into style attributes and unwrap
    // or restyle the spans and lists left over from the site markup.
    $saveDict = str_replace('class=', 'style=', $saveDict);
    $saveDict = str_replace('id="idiom_list"', 'style="list-style: none;"', $saveDict);
    $saveDict = str_replace('id="', 'style="', $saveDict);
    $saveDict = preg_replace("/<span style=\"Definition\">(.*?)<\/span>/is", "$1", $saveDict);
    $saveDict = preg_replace("/<ul style=\"\">(.*?)<\/ul>/is", "<span style='font-size: 20px; color: #CC00FF; font-weight: bold;'>$1</span>", $saveDict);
    $saveDict = preg_replace("/<span style=\"color:#444444; padding-left:20px;font-size: 13px\">¶<\/span> *<span style=\"Use\">(.*?)<\/span>/is", "<div style='color:#444444; padding-left: 20px; font-size: 12px; line-height: 120%'>¶ $1</div>", $saveDict);

    // One entry per line, appended to the output file.
    saveDict('korea'.$max.'.txt', $saveDict."\n");

    $time = date("H:i:s");
    echo "<br>$i : $count : $time -".$saveDict;
}
?>