changeset 564: |
7f0505f8e891 |
parent 563: |
ccfdb0fb8c2f |
child 565: |
a370464b995b |
author: |
uriel@vm41.cat-v.org |
date: |
Thu, 30 Jul 2009 02:12:06 +0200 |
files: |
bin/werclib.rc |
description: |
Much better (I hope) get_html_title implementation: first try to find <title>; if that fails, get the first non-tag string in the file. |
1.1--- a/bin/werclib.rc Wed Jul 29 01:41:18 2009 +0200
1.2+++ b/bin/werclib.rc Thu Jul 30 02:12:06 2009 +0200
1.3@@ -98,12 +98,13 @@
1.4 }
1.5
1.6 fn get_html_title {
1.7- # H1 is not reliable because htmlroff doesn't use it :(
1.8- #desc=`{cat $1 | sed 32q | grep '<[Hh]1>' | sed 's/<[Hh]1>(.*)(<\/[Hh]1>|$)/\1/;s/<[^>]*>//g;1q'}
1.9- # Pick the first line of body instead
1.10- desc=`{sed -n '/<[Bb][Oo][Dd][Yy]/,/./s/(<[^>]*>|$)//gp' < $1}
1.11- if(~ $#desc 0)
1.12- desc=`{sed 's/<[^>]*>//g; 1q' < $1}
1.13+ t=`{sed -n '32q; s/^.*<[Tt][Ii][Tt][Ll][Ee]> *([^<]+) *(<\/[Tt][Ii][Tt][Ll][Ee]>.*)?$/\1/p' < $1}
1.14+
1.15+ # As a backup we might want to pick the first 'non-tag' text in the file with:
1.16+ if(~ $"t '')
1.17+ t=`{sed -n -e 's/^(<[^>]+>)*([^<]+).*/\2/p; 32q' < $1 | sed 1q}
1.18+
1.19+ echo $t
1.20 }
1.21
1.22 fn get_file_title {