#!/usr/bin/perl
# Converts a Hatena Diary csv export to XHatenaML (tentative name).
#
# BUGS:
#  - Automatic links are not supported.
#  - pre is not supported.
#  - Definition lists (dl/dt/dd) are not supported.
#  - Footnotes are not supported.
#  - img, br, hr etc. are not closed with "/>".
#  - Special characters are not escaped automatically (for now only "&",
#    inside paragraphs and list items).
#  - Disabling the automatic p conversion depending on the tag is not
#    supported.
#  - Switching the automatic p conversion on/off with "><" is not
#    compatible with Hatena (when the notation is used in an invalid(?) way).
#
#  - Keywords extracted by the server are not converted.
#  - The del/ins datetime recorded by the server is not converted.
#  - Comments are not processed.
#  - Attached images are not processed.
#
# NOTE(review): this copy of the script appears to have gone through an
# HTML-to-text conversion: most markup that used to live inside string
# literals and here-documents is missing.  Only what could be grounded in
# the remaining code was restored ("&amp;", "</$xl>", the "<<" branch);
# every other affected literal is kept as found and marked NOTE(review).
# Confirm all of them against the upstream script.

use strict;
use warnings;

my $version = "0.0";
my $usage = "csv2xml4hatena.pl [options] file\n"
    . " -row m[-n]: 指定された行の範囲を出力。\n"
    . " -day YYYY-MM-DD: 指定日付のみ出力。\n"
    . " -new YYYY-MM-DD: 指定日付より新しい日付のみ出力。\n"
    . " -t: 入力ファイルは(CSV形式ではなく)テキスト形式(1日分)。\n"
    . " -e: 入力エンコーディングの指定\n"
    ;

# Option values.
my @row_range = ();          # (first, last) row numbers selected by -row
my $day       = "";          # date selected by -day
my $new       = "";          # lower bound (exclusive) selected by -new
my $text_mode = 0;           # -t: input is plain text for a single day
my $encoding  = "shift_jis"; # -e: NOTE(review): parsed but not used anywhere
                             # in the visible code (probably belonged to the
                             # lost XML header) -- confirm.

# Conversion state.  These are declared before the main flow on purpose:
# the main flow ends with exit, so initialisers placed after it never run.
my $indent = "";   # current indentation prefix
my $annon_sect;    # true while no explicit section has been opened yet
my @list_stack;    # nesting history of ul/ol/li
my $xl_level;      # nesting depth of ul/ol
my $dlist;         # definition list flag
my $auto_p;        # true while plain lines are turned into paragraphs
my $super_pre;     # true inside "super pre"

# Guard on @ARGV so an empty argument list is not matched as undef.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $opt = shift @ARGV;
    if ($opt =~ /^-row$/) {
        @row_range = get_range(option_value($opt));
    } elsif ($opt =~ /^-day$/) {
        $day = option_value($opt);
    } elsif ($opt =~ /^-new$/) {
        $new = option_value($opt);
    } elsif ($opt =~ /^-t$/) {
        $text_mode = 1;
    } elsif ($opt =~ /^-e$/) {
        $encoding = option_value($opt);
    } else {
        usage("unknown option $opt");
    }
}

print_header();
if ($text_mode) {
    parse_text();
} else {
    parse_csv();
}
print_footer();
exit;

##########################################################################
## Options

# Returns the argument of option $name taken from @ARGV, or prints the usage
# text and exits when the option is the last word on the command line.
sub option_value {
    my ($name) = @_;
    usage("error: option $name requires an argument.") unless @ARGV;
    return shift @ARGV;
}

# Parses a row range expression "m" or "m-n" and returns (first, last).
# A single number "m" yields (m, m).  Prints the usage text and exits when
# the expression is malformed or when last < first.
sub get_range {
    my ($exp) = @_;
    # Only read $1/$2 after a successful match; after a failed match they
    # would still hold whatever an earlier match left behind.
    usage("error: invalid range expression.")
        unless defined $exp && $exp =~ /^([0-9]+)(?:-([0-9]+))?$/;
    my ($first, $last) = ($1, $2);
    $last = $first unless defined $last;
    usage("error: invalid range expression.") if $last < $first;
    return ($first, $last);
}

# Prints the given message followed by the usage text, then exits with
# status 1.
sub usage {
    print $_[0] . "\n";
    print $usage;
    exit(1);
}

## Header / footer

# Prints the document header.
# NOTE(review): the here-document is empty in this copy; the XML prolog and
# opening root element were presumably lost -- restore from upstream.
sub print_header {
    print <<_XML_;
_XML_
}

# Prints the document footer.
# NOTE(review): the here-document is empty in this copy; the closing root
# element was presumably lost -- restore from upstream.
sub print_footer {
    print <<_XML_;
_XML_
}

##########################################################################
## Diary analysis / conversion

# Emits one diary record.  Only $title and $text are used; $date, $body and
# $comment are accepted for the column layout but ignored.
# NOTE(review): the bare "\n" literals below look like the remains of the
# opening/closing tags of the day and title elements -- confirm.
sub handle_record {
    my ($date, $title, $body, $comment, $text) = @_;
    # Short CSV rows leave trailing fields undefined.
    $title = "" unless defined $title;
    $text  = "" unless defined $text;

    #print "$date\n";
    print "\n";
    indent();
    print $indent . "\n";
    if ($title ne "") {
        indent();
        print $indent . "$title\n";
        unindent();
    }
    handle_sections($text);
    print $indent . "\n";
    unindent();
}

# Walks the body text line by line (CRLF, CR or LF separated) and dispatches
# on the Hatena notation found at the start of each line:
#   "*..."      section heading, optionally preceded by [category] tags
#   "-" / "+"   unordered / ordered list item, nesting by repetition
#   ">>", "<<"  start / end of a quotation
#   ">||"       start of super pre
#   "><..."     switches automatic paragraphs off
# All list and section context still open at the end is closed.
sub handle_sections {
    my ($text) = @_;
    $text = "" unless defined $text;

    $annon_sect = 1;
    @list_stack = ();
    $xl_level   = 0;
    $dlist      = 0;
    $auto_p     = 1;
    $super_pre  = 0;

    foreach my $line (split(/\x0D\x0A|[\x0D\x0A]/, $text)) {
        if ($line !~ /^[-+]+[^-+]/) {  # anything but a list item
            clear_list_context();
        }
        if ($line =~ /^><.+/) {        # suppress automatic p
            $auto_p = 0;
            $line =~ s/^>//;           # drop the leading ">"
        }

        if ($line =~ /^\*((?:\[[^\[\]]+\])*)\s*([^\[].*)$/) {  # section
            my ($cat, $title) = ($1, $2);
            if ($annon_sect) {
                $annon_sect = 0;
            } else {
                clear_section_context();
            }
            start_section($cat, $title);
        } elsif ($line =~ /^([-+]+)([^-+].*)$/) {              # list
            my ($marks, $content) = ($1, $2);
            my $depth_change = length($marks) - $xl_level;
            if ($depth_change <= 0) {
                # Same or shallower level: close what is deeper.
                clear_list_context(-$depth_change);
            } else {
                # Deeper level: open one list per additional marker; the
                # marker character decides between ol and ul.
                foreach my $mark (split(//, substr($marks, $xl_level))) {
                    start_list($mark);
                }
            }
            start_li($content);
        } elsif ($line =~ /^>>/) {      # start of quotation (>>)
            start_bq();
        } elsif ($line =~ /^<</) {      # end of quotation (<<)
            # Restored: the damaged copy read /^<\|\|/ under the comment
            # "(>||)", which is what remains when the text from "</" up to
            # the next ">" is dropped from these two branches.
            end_bq();
        } elsif ($line =~ /^>\|\|/) {   # start of super pre (>||)
            start_spre();
            $super_pre = 1;
        } elsif ($line =~ /^\|\|<$/) {  # release the automatic p suppression
            # NOTE(review): this branch is reproduced as found.  $super_pre
            # is never cleared or read in the visible code, so part of the
            # original handling is probably missing here -- confirm.
            $auto_p = 1;
            $line =~ s/<$//;
        } else {
            # NOTE(review): reconstructed.  The damaged copy had the raw
            # print followed by three surplus closing braces and never
            # called handle_p; this shape accounts for both, but it is an
            # inference -- confirm against upstream.
            if ($auto_p) {
                handle_p($line);
            } else {
                indent();
                print $indent . "$line\n";
                unindent();
            }
        }
    }
    clear_list_context();
    clear_section_context();
}

# Emits one paragraph; empty input produces no output.  "&" is escaped as
# "&amp;" (the damaged copy had the no-op s/&/&/g).
# NOTE(review): the newline-only literals presumably lost the opening and
# closing paragraph tags -- confirm.
sub handle_p {
    my ($p) = @_;
    return if ($p eq "");
    $p =~ s/&/&amp;/g;
    indent();
    print $indent . "\n\n";
    print "$p";
    print "\n\n\n";
    unindent();
}

## Sections

# Opens a section and prints its title and, when present, its categories.
# NOTE(review): the newline-only literals presumably lost the section and
# title markup -- confirm.
sub start_section {
    my ($cat, $title) = @_;
    indent();
    print "$indent\n\n\n";
    indent();
    print $indent . "$title\n";
    handle_categories($cat) if ($cat ne "");
    print "\n";
    unindent();
}

# Closes the current section.
# NOTE(review): closing markup presumably lost from the literal -- confirm.
sub end_section {
    print $indent . "\n\n\n";
    unindent();
}

# Closes the current section unless only the implicit (anonymous) one is
# open.
sub clear_section_context {
    if (!$annon_sect) {
        end_section();
    }
}

# Prints the category names contained in a "[a][b]..." string on one line.
# NOTE(review): the names are printed back to back; per-category markup was
# presumably lost from the literal -- confirm.
sub handle_categories {
    my ($cat) = @_;
    chop $cat;                 # drop the final "]"
    $cat =~ s/\[//g;           # drop every "["
    my @cats = split(/\]/, $cat);
    print $indent;
    foreach my $name (@cats) {
        print "$name";
    }
    print "\n";
}

## Lists

# True when the innermost open list element is an li.
sub in_li {
    # Guard against the empty stack so no undef is compared.
    return @list_stack && $list_stack[-1] eq "li";
}

# Opens an ol ("+") or ul (any other marker) and records it on the stack.
sub start_list {
    my ($c) = @_;
    my $xl = ($c eq "+") ? "ol" : "ul";
    print "\n" if (in_li());   # end the line of the enclosing item's text
    push @list_stack, $xl;
    $xl_level++;
    indent();
    print $indent . "<$xl>\n";
}

# Closes the innermost ol/ul.  The closing tag is restored from the popped
# element name (the damaged copy printed only a newline).
sub end_list {
    my $xl = pop @list_stack;
    print $indent . "</$xl>\n";
    unindent();
    $xl_level--;
}

# Closes the open li and then $level enclosing lists, each together with
# the li that contained it.  Without an argument every open list is closed;
# nothing happens when no list is open.
sub clear_list_context {
    my $level = $xl_level;
    return if ($level == 0);
    ($level) = @_ if (1 == @_);
    end_li("") if (in_li());
    foreach (1 .. $level) {
        end_list();
        end_li($indent) if (in_li());
    }
}

# Opens a list item and prints its text without a trailing newline, so a
# nested list or end_li can continue from there.  "&" is escaped as "&amp;"
# (the damaged copy had the no-op s/&/&/g).
# NOTE(review): the opening item tag was presumably lost from the literal --
# confirm.
sub start_li {
    my ($content) = @_;
    push @list_stack, "li";
    indent();
    $content =~ s/&/&amp;/g;
    print $indent . "\n\n$content";
}

# Closes the innermost list item.  $prefix is printed before the closing
# text: "" directly after the item text, the current indentation after a
# nested list.
# NOTE(review): the closing item tag was presumably lost from the literal --
# confirm.
sub end_li {
    my ($prefix) = @_;
    print $prefix . "\n\n\n";
    unindent();
    pop @list_stack;
}

## Quotations

# Opens a quotation.  $cite and $title are accepted but unused.
# NOTE(review): markup presumably lost from the literal -- confirm.
sub start_bq {
    my ($cite, $title) = @_;
    indent();
    print $indent . "\n";
}

# Closes a quotation.
# NOTE(review): markup presumably lost from the literal -- confirm.
sub end_bq {
    print $indent . "\n";
    unindent();
}

## super pre

# Opens a super pre block.
# NOTE(review): markup presumably lost from the literal -- confirm.
sub start_spre {
    print $indent . "\n\n\n";
}

## Indentation

# Deepens the indentation prefix by one step.
sub indent {
    $indent .= " ";
}

# Removes one step from the indentation prefix.
sub unindent {
    $indent =~ s/ $//;
}

## Text input

# Reads the whole input as the body of one untitled record.
sub parse_text {
    my $text = "";
    while (my $chunk = <>) {
        $text .= $chunk;
    }
    handle_record("0001-01-01", "(untitled)", "", "", $text);
}

## CSV parser

# Reads CSV records from the input, skips row 0, applies the first active
# filter of -row / -day / -new and hands each selected record to
# handle_record.  -day and -new stop reading at the first older date, so
# they rely on the rows being ordered newest first.
sub parse_csv {
    for (my $row = 0; defined(my $line = <>); $row++) {
        my ($date) = $line =~ /^([^,]+),/;
        $date = "" unless defined $date;
        # An odd number of quotes means the record continues on the next
        # physical line.
        while (($line =~ tr/\"//) % 2 and !eof()) {
            $line .= <>;
        }

        if ($row == 0) {
            next;
        } elsif (@row_range == 2) {
            next if ($row < $row_range[0]);
            last if ($row_range[1] < $row);
        } elsif ($day ne "") {
            # Test "older" first: after the inequality test it could never
            # be true, so the early exit was unreachable.
            last if ($date lt $day);
            next if ($date ne $day);
        } elsif ($new ne "") {
            last if ($date le $new);
        }

        # chomp, not chop: a final record without a trailing newline must
        # not lose its last character.
        chomp($line);
        $line =~ s/[\x0D]$//;
        $line .= ",";   # so every field, including the last, ends in ","
        my @record = map { s/^\"|\"$//g; s/\"\"/\"/g; $_; }
            ($line =~ /(\"[^\"]*(?:\"\"[^\"]*)*\"|[^,]*),/g);
        handle_record(@record);
    }
}