ripnews/trunk/ripnews/news/article.rb
2003-04-28 20:50:43 +00:00

652 lines
18 KiB
Ruby

#################################
#
# $Dwarf: article.rb,v 1.66 2003/04/28 20:34:15 ward Exp $
# $Source$
#
# article.rb
#
# (C) 2002, Ward Wouts
#
#################################
require 'set/intspan'
require 'net/nntp'
require 'news/newsrc'
require 'tempfile'
require 'timeout'
# Base class of the errors raised by Article.
class ArticleError < RuntimeError
end

# Raised for problems limited to one request or one article, e.g. an
# unexpected xhdr response or a message-id that no other server carries.
class TempError < ArticleError
end

# Raised when a server cannot be used any further, e.g. a failed reconnect
# or repeated timeouts.
class PermError < ArticleError
end
class Article
Debuglevel = 0
# Set up the article collection for one newsgroup.
#
# nntpservers:: server names separated by '|'
# groupname::   name of the newsgroup to work on
# newsrc::      base name of the newsrc files; server "x" uses "<newsrc>.x"
#
# For every server a connection is opened, its newsrc is loaded and the
# articles marked there for the group are registered as ids to skip.
# A server whose setup raises SocketError or Errno::EINVAL is reported and
# removed again with del_server.
def initialize(nntpservers, groupname, newsrc="~/.newsrc")
  @messids = []
  @ids = []
  @servers = []
  @subjects = []
  @sorted = false
  @grouped = false
  @groups = {}
  @gotten = {}
  @group = groupname
  @serverlist = nntpservers.split('|')
  @connections = {}
  # Walk a copy of the list: del_server deletes from @serverlist, and
  # deleting from the array that is being iterated makes the iteration skip
  # the server that follows a failed one.
  @serverlist.dup.each{|server|
    @connections[server] = {}
    begin
      @connections[server]["nntp"] = Net::NNTP.new(server)
      @connections[server]["skip_ids"] = Set::IntSpan.new()
      @connections[server]["newsrc"] = News::Newsrc.new("#{newsrc}.#{server}")
      set_skip_ids(server, @connections[server]["newsrc"].marked_articles(@group))
    rescue SocketError, Errno::EINVAL
      print "Connection to #{server} failed: #{$!}\n"
      del_server(server)
    end
  }
end
# Replace the NNTP connection of +server+ with a newly opened one.
#
# When opening the connection raises SocketError the failure is printed, the
# server is removed with del_server and PermError is raised.
def reconnect(server)
  begin
    fresh = Net::NNTP.new(server)
  rescue SocketError => err
    print "Reconnect to #{server} failed: #{err}\n"
    del_server(server)
    raise PermError, "Couldn't connect to #{server}"
  end
  @connections[server]["nntp"] = fresh
  print "Succesfully reconnected to #{server}\n"
end
# Register one article.
#
# messid::  message-id of the article
# id::      article number on +server+ (stored as an integer)
# server::  server the article was seen on
# subject:: subject line of the article
#
# The four values are appended to the parallel arrays @messids, @ids,
# @servers and @subjects; the sorted/grouped flags are reset because the
# derived data is now out of date.
def add(messid, id, server, subject)
  @messids << messid
  @ids << id.to_i
  @servers << server
  @subjects << subject
  @sorted = @grouped = false
end
# Forget +server+: announce the removal, drop its entry from @connections
# and take it out of @serverlist.
# Returns what Array#delete returns for @serverlist (the name, or nil when
# it was not listed).
def del_server(server)
  print("Removing server #{server} from list\n")
  @connections.delete(server)
  return @serverlist.delete(server)
end
# Collect subject and message-id of the unread articles on every server.
#
# cachedir:: directory handed to read_cache / save_cache
#
# Step 1: ask each server for the first and last article number of the
#         group, remember them in @connections and unmark newsrc entries
#         below the first number. Servers that raise PermError, or that
#         report first > last, are removed.
# Step 2: load cached headers with read_cache.
# Step 3: for each remaining server fetch the "subject" and "message-id"
#         headers (get_xhdr) of the not yet skipped article numbers and
#         register every article that has both with add.
# Step 4: write the cache with save_cache (its result is returned).
def get_articles(cachedir=false)
  @connections.keys.each{|server|
    begin
      first, last = get_group_info(server)
    rescue PermError
      print "Error: #{$!}\n"
      del_server(server)
      next
    end
    # Compare numerically. The rest of this class calls to_i on the stored
    # values, so they may well be strings, and comparing strings would call
    # "98" greater than "105" and wrongly drop the server.
    if first.to_i <= last.to_i
      @connections[server]["first"] = first ? first : 0
      @connections[server]["last"] = last ? last : 0
      if Debuglevel > 0
        print " Server: #{server}\n"
        print " First: #{first}\n"
        print " Last: #{last}\n"
      end
      # clean up old newsrc entries
      if @connections[server]["first"].to_i > 0
        @connections[server]["newsrc"].unmark_range(@group, 0, (@connections[server]["first"].to_i-1).to_s)
        @connections[server]["newsrc"].save
      end
    else
      print " First article has higher number than last article on server #{server}.\n"
      del_server(server)
    end
  }
  read_cache(cachedir)
  @connections.keys.each{|server|
    print " reading articles from server: #{server}\n"
    range = Set::IntSpan.new("#{@connections[server]["first"]}-#{@connections[server]["last"]}")
    unread = range.diff(@connections[server]["skip_ids"])
    rangelist = rechunk_runlist(unread.run_list)
    print "rangelist: #{rangelist}\n" if Debuglevel > 2
    # Object#type is gone from current Ruby; Object#class works everywhere.
    print "rangelist: #{rangelist.class.to_s}\n" if Debuglevel > 2
    print "rangelist elements: #{unread.elements}\n" if Debuglevel >2
    begin
      unless rangelist == nil or rangelist =~ /^$/
        rangelist.split(',').each{|i|
          print "i: #{i}\n" if Debuglevel > 2
          begin
            resp, subj_lines = get_xhdr(server, i, "subject")
            resp, messid_lines = get_xhdr(server, i, "message-id")
          rescue TempError
            # give up on this range only, carry on with the next one
            print "Caught: #{$!} reading from #{server} (get_articles)\n"
            next
          end
          # art maps article number => {"subject" => ..., "messid" => ...}
          art = {}
          subj_lines.each{|x|
            art[x[0]] = {} unless art.has_key?(x[0])
            art[x[0]]["subject"] = x[1]
            print "art id: #{x[0]} subj: #{x[1]}\n" if Debuglevel > 1
          }
          messid_lines.each{|x|
            art[x[0]] = {} unless art.has_key?(x[0])
            art[x[0]]["messid"] = x[1]
            print "art id: #{x[0]} messid: #{x[1]}\n" if Debuglevel > 1
          }
          art.keys.each{|id|
            # only articles for which both headers arrived are usable
            if art[id].has_key?("subject") and art[id].has_key?("messid")
              print "adding: #{art[id]["messid"]}, #{id}, #{server}, #{art[id]["subject"]}\n" if Debuglevel > 1
              add(art[id]["messid"], id, server, art[id]["subject"])
            end
          }
        }
      end
    rescue PermError
      print "Error: #{$!}\n"
      del_server(server)
      next
    end
  }
  save_cache(cachedir)
end
# Select @group on +server+ and return the first and last article number
# reported for it, as [first, last].
#
# * Net::NNTP::RuntimeError from the server is turned into PermError.
# * A broken connection (EPIPE, ECONNRESET, EOFError) leads to a reconnect
#   and another attempt.
# * A timeout (30 seconds) leads to one reconnect and retry; the second
#   timeout raises PermError.
def get_group_info(server)
  timedout = 0
  first = ""
  last = ""
  begin
    # Timeout.timeout / Timeout::Error are the names the timeout library
    # still provides; bare timeout() and TimeoutError have been removed.
    Timeout.timeout(30) do
      begin
        _resp, _count, first, last, _name = @connections[server]["nntp"].group(@group)
      rescue Net::NNTP::RuntimeError
        # Object#type no longer exists; Object#class is the portable spelling.
        print "Caught #{$!.class} from #{server}\n"
        raise PermError, "#{$!}"
      rescue Errno::EPIPE, Errno::ECONNRESET, EOFError
        print "Caught #{$!.class} reading from server #{server} (get_group_info)\n"
        print "Error: #{$!}\n"
        reconnect(server)
        retry
      end
    end
  rescue Timeout::Error
    timedout += 1
    raise PermError, "Too many timeouts! (get_group_info)" if timedout > 1
    print "Time out, reconnecting to server... (get_group_info)\n"
    reconnect(server)
    retry
  end
  return first, last
end
# Fetch one header for a range of articles with the xhdr command.
#
# server:: key into @connections
# range::  article range handed to xhdr
# header:: header name handed to xhdr (get_articles uses "subject" and
#          "message-id")
#
# Returns [resp, lines] as delivered by the connection's xhdr call.
# Raises TempError when the response code is not 2xx and PermError after a
# second timeout (180 seconds each). Broken connections and 503/400 server
# errors lead to a reconnect, re-selection of the group and another attempt.
# Any other Net::NNTP::RuntimeError is only reported; the method then
# returns the current values of resp and lines.
def get_xhdr(server, range, header)
  timedout = 0
  resp = ""
  lines = []
  begin
    # Timeout.timeout / Timeout::Error are the names the timeout library
    # still provides; bare timeout() and TimeoutError have been removed.
    Timeout.timeout(180) do
      begin
        resp, lines = @connections[server]["nntp"].xhdr(header, range)
        if resp.to_i == 500
          print "xhdr not implemented\n"
          print "Error: #{$!}\n"
        end
        unless resp.to_i >= 200 and resp.to_i < 300
          print "got response #{resp} while reading group #{@group} from #{server}\n"
          raise TempError
        end
      rescue Net::NNTP::RuntimeError
        # Object#type no longer exists; Object#class is the portable spelling.
        print "Caught #{$!.class} reading from server #{server} (get_xhdr)\n"
        print "Error: #{$!}\n"
        if ( $!.to_s =~ /^503|^400/ )
          reconnect(server)
          get_group_info(server)
          retry
        else
          print "Won't handdle this... yet :(\n"
        end
      rescue Errno::EPIPE, Errno::ECONNRESET, EOFError
        print "Caught #{$!.class} reading from server #{server} (get_xhdr)\n"
        print "Error: #{$!}\n"
        reconnect(server)
        get_group_info(server)
        retry
      end
    end
    return resp, lines
  rescue Timeout::Error
    print "Time out, reconnecting to server (get_xhdr)\n"
    timedout += 1
    raise PermError, "Too many timeouts! (get_xhrd)" if timedout > 1
    reconnect(server)
    get_group_info(server)
    retry
  end
end
# if xhdr doesn't work, this should be used
# for i in (range.diff(@connections[server]["skip_ids"]).elements)
# begin
# @connections[server]["nntp"].stat(i)
# resp, id, messid, list = @connections[server]["nntp"].head(i)
# for j in list
# if j =~ /Subject: (.*)/
# subj=$1
# end
# end
# print "get_articles messid: #{messid}\n" if Debuglevel > 1
# print "get_articles id: #{id}\n" if Debuglevel > 1
# print "get_articles server: #{server}\n" if Debuglevel > 1
# print "get_articles subject: #{subj}\n" if Debuglevel > 1
# add(messid, id, server, subj)
# rescue Net::NNTP::RuntimeError
# print "whoopsie couldn't stat #{i}\n" if Debuglevel > 1
# end
# end
# Return the hash of subject groups, building it first with group_subjects
# when the current grouping is out of date.
def get_groups
  unless @grouped
    group_subjects
  end
  @groups
end
# Name of the newsgroup this object works on.
def get_groupname
  @group
end
# Fetch the body of one article.
#
# server::  key into @connections
# message:: article identifier handed to the connection's body call
#
# Returns [resp, id, messid, list] as delivered by the connection, or false
# when the server answered with a reply error or the connection hit EOF.
# A reply error starting with 503 or 400 is retried once after a reconnect.
# EPIPE/ECONNRESET lead to a reconnect and another attempt. A timeout
# (180 seconds) is retried once; the second one raises PermError.
def get_body(server, message)
  timedout = 0
  resp = ""
  id = ""
  messid = ""
  list = []
  retries = 0
  begin
    # Timeout.timeout / Timeout::Error are the names the timeout library
    # still provides; bare timeout() and TimeoutError have been removed.
    Timeout.timeout(180) do
      begin
        resp, id, messid, list = @connections[server]["nntp"].body(message)
      rescue Net::NNTPReplyError
        # Take the text with to_s: appending the exception object to a string
        # depends on an implicit conversion that exceptions no longer offer.
        reply = $!.to_s
        # Object#type no longer exists; Object#class is the portable spelling.
        print "Caught #{$!.class} reading article #{message} from #{server} (get_body)\n"
        print "Error: #{$!}\n"
        if retries == 0 && (reply =~ /^503/ || reply =~ /^400/)
          reconnect(server)
          get_group_info(server)
          retries = 1
          retry
        end
        return false
      rescue EOFError
        print "Caught #{$!.class} reading article #{message} from #{server} (get_body)\n"
        print "Error: #{$!}\n"
        return false
      rescue Errno::EPIPE, Errno::ECONNRESET
        print "Caught #{$!.class} reading from server #{server} (get_body)\n"
        print "Error: #{$!}\n"
        reconnect(server)
        get_group_info(server)
        retry
      end
    end
    return resp, id, messid, list
  rescue Timeout::Error
    print "Time out, reconnecting to server (get_body)\n"
    timedout += 1
    raise PermError, "Too many timeouts! (get_body)" if timedout > 1
    reconnect(server)
    get_group_info(server)
    retry
  end
end
# Fetch every article of group +subj+ that has not been fetched yet.
#
# The group is sorted first (group_subject_sort). When an article cannot be
# fetched from its server (get_body returned false, or the server is no
# longer in @serverlist) and the next entry carries the same message-id,
# that entry's server is tried instead; otherwise TempError is raised.
# Fetched message-ids are recorded in @gotten.
#
# Returns the body lines of the last article fetched by this call, or []
# when nothing had to be fetched.
# NOTE(review): get_group_body_rest accumulates all bodies with concat while
# this method keeps only the last one - confirm that this is intended.
def get_group_body(subj)
  group_subject_sort(subj)
  entry = @groups[subj]
  messages = entry["messages"]
  servers = entry["servers"]
  result = []
  messages.length.times{|start|
    next if @gotten.has_key?(messages[start])
    print "getting article: #{start}\n" if Debuglevel > 1
    print "getting article: #{subj}\n" if Debuglevel > 1
    print "full subject: #{entry["subject"][start]}\n" if Debuglevel > 0
    print "message id: #{messages[start]}\n" if Debuglevel > 1
    print "id: #{entry["ids"][start]}\n" if Debuglevel > 1
    print "server: #{servers[start]}\n" if Debuglevel > 0
    # pos walks forward over entries that carry the same message-id
    pos = start
    body = nil
    resp = false
    while resp == false
      if @serverlist.include?(servers[pos])
        resp, id, messid, body = get_body(servers[pos], messages[pos])
      else
        resp = false
      end
      next unless resp == false
      if Debuglevel > 1
        print "mess-id i: #{messages[pos]}\n"
        print "mess-id i+1: #{messages[pos+1]}\n"
      end
      unless (pos+1 < messages.length) and (messages[pos] == messages[pos+1])
        raise TempError, " Message-id not on another server"
      end
      print " Trying next server...\n"
      pos += 1
    end
    @gotten[ messages[pos] ] = true
    result = body
  }
  return result
end
# Fetch the first article of group +subj+ unless it has been fetched before.
#
# The group is sorted first (group_subject_sort). When the article cannot be
# fetched from a server and the next entry carries the same message-id, that
# entry's server is tried instead; otherwise TempError is raised. The
# fetched message-id is recorded in @gotten.
#
# Returns the body lines of the article, or nil when it was already fetched.
def get_group_body_first(subj)
  group_subject_sort(subj)
  entry = @groups[subj]
  messages = entry["messages"]
  servers = entry["servers"]
  list = nil
  i = 0
  while @gotten.has_key?(messages[0]) == false
    print "getting article: #{subj}\n" if Debuglevel > 0
    print "full subject: #{entry["subject"][0]}\n" if Debuglevel > 0
    print "message id: #{messages[i]}\n" if Debuglevel > 1
    print "id: #{entry["ids"][i]}\n" if Debuglevel > 1
    print "server: #{servers[0]}\n" if Debuglevel > 0
    resp = false
    while resp == false
      # Same guard as get_group_body: a server removed by del_server has no
      # entry in @connections any more, so get_body must not be called for it.
      if @serverlist.include?(servers[i])
        resp, _id, _messid, list = get_body(servers[i], messages[i])
      else
        resp = false
      end
      if resp == false
        print "mess-id i: #{messages[i]}\n"
        print "mess-id i+1: #{messages[i+1]}\n"
        if (i+1 < messages.length) and
           (messages[i] == messages[i+1])
          print "Trying next server...\n"
          i += 1
        else
          raise TempError, "Message-id not on another server"
        end
      end
    end
    @gotten[messages[i]] = true
  end
  return list
end
# Fetch all articles of group +subj+ after the first one that have not been
# fetched yet.
#
# subj:: key of the group in @groups
# file:: optional object with a print method; when given, every body line is
#        written to it followed by "\n" instead of being collected
#
# When an article cannot be fetched from a server and the next entry carries
# the same message-id, that entry's server is tried instead; otherwise
# TempError is raised. Fetched message-ids are recorded in @gotten.
#
# Returns the collected body lines ([] when +file+ was given).
def get_group_body_rest(subj, file=nil)
  entry = @groups[subj]
  messages = entry["messages"]
  servers = entry["servers"]
  result = []
  (1...messages.length).each{|start|
    next if @gotten.has_key?(messages[start])
    print "getting article: #{start}\n" if Debuglevel > 1
    print "getting article: #{subj}\n" if Debuglevel > 1
    print "full subject: #{entry["subject"][start]}\n" if Debuglevel > 0
    print "message id: #{messages[start]}\n" if Debuglevel > 1
    print "id: #{entry["ids"][start]}\n" if Debuglevel > 1
    print "server: #{servers[start]}\n" if Debuglevel > 0
    # pos walks forward over entries that carry the same message-id
    pos = start
    list = nil
    resp = false
    while resp == false
      # Same guard as get_group_body: a server removed by del_server has no
      # entry in @connections any more, so get_body must not be called for it.
      if @serverlist.include?(servers[pos])
        resp, _id, _messid, list = get_body(servers[pos], messages[pos])
      else
        resp = false
      end
      if resp == false
        print "mess-id i: #{messages[pos]}\n"
        print "mess-id i+1: #{messages[pos+1]}\n"
        if (pos+1 < messages.length) and
           (messages[pos] == messages[pos+1])
          print "Trying next server...\n"
          pos += 1
        else
          raise TempError, "Message-id not on another server"
        end
      end
    end
    @gotten[ messages[pos] ] = true
    if file
      list.each{|line| file.print "#{line}\n"}
    else
      result.concat(list)
    end
  }
  return result
end
# Return the keys of the subject groups, regrouping first when the current
# grouping is out of date.
def get_group_subjects
  group_subjects if not @grouped
  @groups.keys
end
# Return the message-ids stored for group +subject+, regrouping first when
# the current grouping is out of date.
def get_group_messids(subject)
  group_subjects if not @grouped
  entry = @groups[subject]
  entry["messages"]
end
# True when group +subj+ holds at least as many distinct message-ids as its
# "total" says it should (duplicates from several servers count once).
# Regroups first when the current grouping is out of date.
def group_is_complete(subj)
  group_subjects unless @grouped
  entry = @groups[subj]
  print "length: #{entry["messages"].length} total: #{entry["total"].to_i}\n" if Debuglevel > 1
  distinct = entry["messages"].uniq
  distinct.length >= entry["total"].to_i
end
# True when the "total" recorded for group +subj+ is exactly one part.
def group_is_singlepart(subj)
  parts = @groups[subj]["total"].to_i
  return parts == 1
end
# True when the "total" recorded for group +subj+ is more than one part.
def group_is_multipart(subj)
  parts = @groups[subj]["total"].to_i
  return parts > 1
end
# The message-ids of all registered articles, in registration order.
def get_messids
  @messids
end
# The subject lines of all registered articles, in registration order.
def get_subjects
  @subjects
end
# Rebuild @groups from the registered articles.
#
# A subject containing a part counter "(n/m)" or "[n/m]" is filed under the
# subject with the counter removed and " (m)" appended; its part number is n
# and its total m (kept as the matched string). Any other subject is its own
# key with part number and total 1. Articles whose part number is 0 are
# left out.
#
# Each group is a hash with "total" plus the parallel arrays "messages",
# "ids", "servers" and "subject". Sets @grouped and returns true.
def group_subjects
  @groups = {}
  @subjects.each_with_index{|subject, idx|
    print "group subjects: #{idx} #{subject}\n" if Debuglevel > 1
    if subject =~ /(.*)\((\d+)\/(\d+)\)(.*)/ || subject =~ /(.*)\[(\d+)\/(\d+)\](.*)/
      key = "#{$1}#{$4} (#{$3})"
      number = $2
      total = $3
    else
      key = subject
      number = 1
      total = 1
    end
    # part 0 of a posting is not counted as a member of the group
    next if number.to_i == 0
    unless @groups.has_key?(key)
      @groups[key] = {
        "total" => total,
        "messages" => [],
        "ids" => [],
        "servers" => [],
        "subject" => []
      }
    end
    group = @groups[key]
    group["messages"].push(@messids[idx])
    group["ids"].push(@ids[idx].to_i)
    group["servers"].push(@servers[idx])
    group["subject"].push(subject)
  }
  @grouped = true
end
# Install +ids+ as the set of article numbers to skip on +server+.
#
# The value is turned into a Set::IntSpan. It is rejected (false is
# returned, nothing is stored) when the span is not finite or when its
# minimum is negative; otherwise it is stored in @connections and true is
# returned.
def set_skip_ids(server, ids)
  span = Set::IntSpan.new(ids)
  return false unless span.finite
  lowest = span.min
  return false if lowest != nil and lowest < 0
  @connections[server]["skip_ids"] = span
  true
end
# Mark every article of group +subject+ in the newsrc of the server it was
# seen on, using the article number recorded for that server.
def group_update_newsrc(subject)
  entry = @groups[subject]
  (0...entry["messages"].length).each{|n|
    newsrc = @connections[entry["servers"][n]]["newsrc"]
    newsrc.mark(@group, entry["ids"][n])
  }
end
# Save the newsrc of every server that is still connected.
def save_newsrc()
  @connections.keys.each{|server|
    @connections[server]["newsrc"].save
  }
end
# Load cached article headers for the group from
# "<cachedir>/<group>.ripnewscache".
#
# cachedir:: cache directory; nil or false means "no cache" and nothing is
#            done (get_articles passes false by default)
#
# Every line has the form "id|message-id|server|subject". A line is used
# when its server is still connected, its id is not in that server's skip
# set and lies within the server's first..last range. Such an article is
# registered with add and its id is inserted into the skip set, so it is
# not requested from the server again.
def read_cache(cachedir)
  # Without this guard FileTest.directory?(false) raises a TypeError.
  return unless cachedir
  filename = "#{cachedir}/#{@group}.ripnewscache"
  excludes = {}
  @connections.keys.each{|server|
    excludes[server] = {}
    # each, not collect!: only the lookup table is wanted, the array handed
    # out by the skip set is left untouched
    @connections[server]["skip_ids"].elements.each{|x| excludes[server][x] = true}
  }
  return unless FileTest.directory?(cachedir) && FileTest.file?(filename) && FileTest.readable?(filename)
  # block form closes the file even when add raises
  File.open(filename){|file|
    file.each_line{|line|
      next unless line =~ /^(\d+)\|(.*?)\|(.*?)\|(.*)$/
      id, messid, server, subject = $1, $2, $3, $4
      next unless @connections.has_key?(server)
      next if excludes.has_key?(server) and excludes[server].has_key?(id.to_i)
      next if id.to_i < @connections[server]["first"].to_i
      next if id.to_i > @connections[server]["last"].to_i
      add(messid, id, server, subject)
      @connections[server]["skip_ids"].insert(id.to_i)
    }
  }
end
# Write the headers of all registered articles to
# "<cachedir>/<group>.ripnewscache", one "id|message-id|server|subject"
# line per article, sorted as strings.
#
# cachedir:: cache directory; nothing is written when it is nil/false
#            (get_articles passes false by default) or not a directory
#
# Returns nil.
def save_cache(cachedir)
  # Without this guard FileTest.directory?(false) raises a TypeError.
  return unless cachedir
  return unless FileTest.directory?( cachedir )
  filename = "#{cachedir}/#{@group}.ripnewscache"
  cache = []
  @subjects.each_index{|i|
    cache.push("#{@ids[i]}|#{@messids[i]}|#{@servers[i]}|#{@subjects[i]}\n")
  }
  cache.sort!
  # block form closes the file even when a write fails
  File.open( filename, "w" ){|file|
    # Write line by line: printing the array object itself only gives the
    # concatenated lines on Rubies where Array#to_s joins the elements.
    cache.each{|line| file.print line}
  }
  nil
end
###############################################################
# a base64 decoder...
# Decode base64 text.
#
# str:: base64 data, possibly spread over several lines
#
# Each line is decoded on its own: every character outside the base64
# alphabet (A-Z, a-z, 0-9, '+', '/') is dropped, including '=' padding and
# stray whitespace, and the remainder is decoded with unpack("m"). The
# decoded pieces are concatenated and returned as one string.
def decode64(str)
  decoded = []
  str.split("\n").each{|line|
    # '/' belongs to the alphabet; dropping it would corrupt the data
    clean = line.delete('^A-Za-z0-9+/')
    decoded.push(clean.unpack("m")[0])
  }
  decoded.join
end
###############################################################
# Sort the articles of group +subj+ in place.
#
# Articles are ordered by subject with ward_sort; articles with equal
# subjects are ordered by the position of their server in @serverlist.
# Servers that are no longer listed sort after all listed ones. The
# parallel arrays "subject", "messages", "ids" and "servers" of the group
# are rebuilt in the new order; a "total" entry is kept.
def group_subject_sort(subj)
  #print "Sorting articles\n"
  serverhash = {}
  @serverlist.each_with_index{|server, rank| serverhash[server] = rank}
  # rank for servers removed by del_server; comparing nil would make sort! fail
  unknown = @serverlist.length
  entry = @groups[subj]
  sort_arr = []
  (0...entry["subject"].length).each{|i|
    print "subj sort #{entry["subject"][i]}\n" if Debuglevel > 2
    print "subj sort #{entry["messages"][i]}\n" if Debuglevel > 2
    print "subj sort #{entry["ids"][i]}\n" if Debuglevel > 2
    print "subj sort #{entry["servers"][i]}\n" if Debuglevel > 2
    sort_arr.push( [
      entry["subject"][i].dup,
      entry["messages"][i].dup,
      entry["ids"][i],  # integer: immutable, and dup is not allowed on it everywhere
      entry["servers"][i].dup
    ] )
  }
  sort_arr.sort!{|a,b|
    r = ward_sort(a[0], b[0])
    if r == 0
      r = (serverhash[a[3]] || unknown) <=> (serverhash[b[3]] || unknown)
    end
    r
  }
  # clear would also throw away "total", which the completeness and
  # single/multipart checks read, so carry it over
  had_total = entry.has_key?("total")
  total = entry["total"]
  entry.clear
  entry["total"] = total if had_total
  entry["subject"] = []
  entry["messages"] = []
  entry["ids"] = []
  entry["servers"] = []
  sort_arr.each{|row|
    entry["subject"].push(row[0])
    entry["messages"].push(row[1])
    entry["ids"].push(row[2])
    entry["servers"].push(row[3])
    print "subject sort: #{row[0]}\n" if Debuglevel > 2
    print "server: #{row[3]}\n" if Debuglevel > 2
  }
  #print "Done sorting\n"
end
# Compare two values the way a human orders numbered names.
#
# Both values are converted to strings and cut into runs of digits and runs
# of other characters. Corresponding pieces are compared one by one: as
# integers when both are digit runs, as strings otherwise. The first
# difference decides. When all pieces of +a+ match and +b+ has pieces left,
# -1 is returned; when nothing differs, 0.
def ward_sort(a, b)
  left = a.to_s.split(/([0-9]+)/)
  right = b.to_s.split(/([0-9]+)/)
  left.each_with_index{|lpart, idx|
    rpart = right[idx]
    if lpart.to_s =~ /^[0-9]+$/ and rpart.to_s =~ /^[0-9]+$/
      order = lpart.to_i <=> rpart.to_i
    else
      order = lpart.to_s <=> rpart.to_s
    end
    return order unless order == 0
  }
  right.length > left.length ? -1 : 0
end
# Cut the ranges of a run list into pieces that can be requested one at a
# time.
#
# runlist:: comma separated list of numbers and "low-high" ranges, or nil
#
# A range whose bounds differ by more than 200 is split into consecutive
# ranges of 200 numbers, followed by one final range holding the remainder
# (at most 201 numbers). Everything else is passed through unchanged.
# Returns the new comma separated list ("" for an empty list), or nil when
# +runlist+ is nil.
def rechunk_runlist(runlist)
  return nil if runlist == nil
  chunks = []
  runlist.split(',').each{|run|
    if run =~ /\A(\d+)-(\d+)\z/
      low = $1.to_i
      high = $2.to_i
      while (high - low) > 200
        chunks.push("#{low}-#{low + 199}")
        low += 200
      end
      chunks.push("#{low}-#{high}")
    else
      chunks.push(run)
    end
  }
  # join once all runs are processed; doing it inside the loop would return
  # the list before anything had been split
  chunks.join(",")
end
# Send quit to every connected server. A connection that is already broken
# (EPIPE, ECONNRESET) is ignored.
def quit
  @connections.keys.each{|server|
    begin
      @connections[server]["nntp"].quit
    rescue Errno::EPIPE, Errno::ECONNRESET
      # nothing to do: the connection is gone anyway
    end
  }
end
private :ward_sort
end # class