about summary refs log tree commit diff stats
path: root/registry/fetch.go
diff options
context:
space:
mode:
author Ben Morrison <ben@gbmor.dev> 2020-06-20 02:27:31 -0400
committer Ben Morrison <ben@gbmor.dev> 2020-06-20 02:27:31 -0400
commit 538e305925b9b04102ef0a4fb7cca19a6c116142 (patch)
tree 8e68b426c40c3151f39da1be874d2938f31ab5f9 /registry/fetch.go
parent 0a69c582ec8b88b1d3af70ef43c3eeb1b99f973d (diff)
download getwtxt-538e305925b9b04102ef0a4fb7cca19a6c116142.tar.gz
updating module to live at sourcehut
Also moving the 'registry' library into this repo, rather
than maintaining them separately. It will still be decoupled,
just live in this repository.
Diffstat (limited to 'registry/fetch.go')
-rw-r--r-- registry/fetch.go 277
1 files changed, 277 insertions, 0 deletions
diff --git a/registry/fetch.go b/registry/fetch.go
new file mode 100644
index 0000000..9adf4ec
--- /dev/null
+++ b/registry/fetch.go
@@ -0,0 +1,277 @@
+/*
+Copyright (c) 2019 Ben Morrison (gbmor)
+
+This file is part of Registry.
+
+Registry is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Registry is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Registry.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package registry // import "git.sr.ht/~gbmor/getwtxt/registry"
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+)
+
+// GetTwtxt fetches the raw twtxt file data from the user's
+// provided URL, after validating the URL. If the returned
+// boolean value is false, the fetched URL is a single user's
+// twtxt file. If true, the fetched URL is the output of
+// another registry's /api/plain/tweets. The output of
+// GetTwtxt should be passed to either ParseUserTwtxt or
+// ParseRegistryTwtxt, respectively.
+// Generally, the *http.Client inside a given Registry instance should
+// be passed to GetTwtxt. If the *http.Client passed is nil,
+// Registry will use a preconstructed client with a
+// timeout of 10s and all other values set to default.
+func GetTwtxt(urlKey string, client *http.Client) ([]byte, bool, error) {
+	if !strings.HasPrefix(urlKey, "http://") && !strings.HasPrefix(urlKey, "https://") {
+		return nil, false, fmt.Errorf("invalid URL: %v", urlKey)
+	}
+
+	res, err := doReq(urlKey, "GET", "", client)
+	if err != nil {
+		return nil, false, err
+	}
+	defer res.Body.Close()
+
+	var textPlain bool
+	for _, v := range res.Header["Content-Type"] {
+		if strings.Contains(v, "text/plain") {
+			textPlain = true
+			break
+		}
+	}
+	if !textPlain {
+		return nil, false, fmt.Errorf("received non-text/plain response body from %v", urlKey)
+	}
+
+	if res.StatusCode != http.StatusOK {
+		return nil, false, fmt.Errorf("didn't get 200 from remote server, received %v: %v", res.StatusCode, urlKey)
+	}
+
+	twtxt, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		return nil, false, fmt.Errorf("error reading response body from %v: %v", urlKey, err)
+	}
+
+	// Signal that we're adding another twtxt registry as a "user"
+	if strings.HasSuffix(urlKey, "/api/plain/tweets") || strings.HasSuffix(urlKey, "/api/plain/tweets/all") {
+		return twtxt, true, nil
+	}
+
+	return twtxt, false, nil
+}
+
+// DiffTwtxt issues a HEAD request for the user's remote twtxt data,
+// sending the user's stored Last-Modified value (when non-empty) as the
+// If-Modified-Since request header.
+//
+// It returns true when the server answers 200 OK, after storing the
+// first non-empty Last-Modified response header on the user. It returns
+// false when the server answers 304 Not Modified or any other status.
+//
+// In some error conditions, such as the user not being in the
+// registry, it returns true alongside the error. In other error
+// conditions considered "unrecoverable," such as the supplied URL being
+// invalid or the request failing, it returns false alongside the error.
+//
+// The registry lock and the user's lock are both held for the duration
+// of the request.
+func (registry *Registry) DiffTwtxt(urlKey string) (bool, error) {
+	if !strings.HasPrefix(urlKey, "http://") && !strings.HasPrefix(urlKey, "https://") {
+		return false, fmt.Errorf("invalid URL: %v", urlKey)
+	}
+
+	// Deferring the unlock immediately guarantees the registry lock is
+	// released on every return path, including the early "not in
+	// registry" return below.
+	registry.Mu.Lock()
+	defer registry.Mu.Unlock()
+
+	user, ok := registry.Users[urlKey]
+	if !ok {
+		return true, fmt.Errorf("user not in registry")
+	}
+
+	user.Mu.Lock()
+	// Runs before the registry unlock above (defers are LIFO), so the
+	// user is stored back while the registry lock is still held.
+	defer func() {
+		registry.Users[urlKey] = user
+		user.Mu.Unlock()
+	}()
+
+	res, err := doReq(urlKey, "HEAD", user.LastModified, registry.HTTPClient)
+	if err != nil {
+		return false, err
+	}
+	// A HEAD response carries no payload, but the body must still be
+	// closed so the transport can reuse the connection.
+	defer res.Body.Close()
+
+	if res.StatusCode != http.StatusOK {
+		// 304 Not Modified and every other status are reported as "no change".
+		return false, nil
+	}
+
+	for _, e := range res.Header["Last-Modified"] {
+		if e != "" {
+			user.LastModified = e
+			break
+		}
+	}
+	return true, nil
+}
+
+// internal function. boilerplate for http requests.
+func doReq(urlKey, method, modTime string, client *http.Client) (*http.Response, error) {
+	if client == nil {
+		client = &http.Client{
+			Transport:     nil,
+			CheckRedirect: nil,
+			Jar:           nil,
+			Timeout:       10 * time.Second,
+		}
+	}
+
+	var b []byte
+	buf := bytes.NewBuffer(b)
+	req, err := http.NewRequest(method, urlKey, buf)
+	if err != nil {
+		return nil, err
+	}
+
+	if modTime != "" {
+		req.Header.Set("If-Modified-Since", modTime)
+	}
+
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("couldn't %v %v: %v", method, urlKey, err)
+	}
+
+	return res, nil
+}
+
+// ParseUserTwtxt takes a fetched twtxt file in the form of a slice of
+// bytes, parses it, and returns it as a TimeMap.
+//
+// Blank lines and lines starting with "#" are ignored. Every other
+// line must consist of exactly two tab-separated columns (timestamp,
+// status); if one does not, parsing stops and a nil TimeMap is returned
+// with an error. Each status is stored under its parsed timestamp as
+// nickname, urlKey and the original line joined by tabs.
+//
+// Lines whose timestamp cannot be parsed are skipped. Their errors are
+// collected and returned together with the TimeMap built from the
+// remaining lines, so a non-nil error does not necessarily mean the
+// TimeMap is empty.
+func ParseUserTwtxt(twtxt []byte, nickname, urlKey string) (TimeMap, error) {
+	if len(twtxt) == 0 {
+		return nil, fmt.Errorf("no data to parse in twtxt file")
+	}
+
+	var errs strings.Builder
+	scanner := bufio.NewScanner(bytes.NewReader(twtxt))
+	timemap := NewTimeMap()
+
+	for scanner.Scan() {
+		nopadding := strings.TrimSpace(scanner.Text())
+		if nopadding == "" || strings.HasPrefix(nopadding, "#") {
+			continue
+		}
+
+		columns := strings.Split(nopadding, "\t")
+		if len(columns) != 2 {
+			return nil, fmt.Errorf("improperly formatted data in twtxt file")
+		}
+
+		thetime, err := time.Parse(time.RFC3339, fixTimestamp(columns[0]))
+		if err != nil {
+			fmt.Fprintf(&errs, "unable to retrieve date: %v\n", err)
+			// Skip the entry: storing it would file it under the zero
+			// time, where every unparseable line overwrites the last.
+			continue
+		}
+
+		timemap[thetime] = nickname + "\t" + urlKey + "\t" + nopadding
+	}
+
+	// Surface scanner failures (e.g. an over-long line) instead of
+	// silently returning a truncated result.
+	if err := scanner.Err(); err != nil {
+		fmt.Fprintf(&errs, "error scanning twtxt file: %v\n", err)
+	}
+
+	if errs.Len() == 0 {
+		return timemap, nil
+	}
+	return timemap, fmt.Errorf("%v", errs.String())
+}
+
+func fixTimestamp(ts string) string {
+	normalizeTimestamp := regexp.MustCompile(`[\+][\d][\d][:][\d][\d]`)
+	return strings.TrimSpace(normalizeTimestamp.ReplaceAllString(ts, "Z"))
+}
+
+// ParseRegistryTwtxt takes output from a remote registry and outputs
+// the accessible user data via a slice of Users.
+//
+// Blank lines and lines starting with "#" are ignored. Every other
+// line must consist of exactly four tab-separated columns, of which the
+// first three are read as nickname, URL and RFC 3339 timestamp; if one
+// does not, parsing stops and a nil slice is returned with an error.
+// Lines sharing a nickname or a URL with an earlier line are merged
+// into that earlier User's Status map, keyed by timestamp; the stored
+// value is the whole original line. Each new User's Date is set to the
+// time of parsing.
+//
+// Lines whose timestamp cannot be parsed are skipped. Their errors are
+// collected and returned together with the users built from the
+// remaining lines. The returned error is nil when every line parsed.
+func ParseRegistryTwtxt(twtxt []byte) ([]*User, error) {
+	if len(twtxt) == 0 {
+		return nil, fmt.Errorf("received no data")
+	}
+
+	var errs strings.Builder
+	scanner := bufio.NewScanner(bytes.NewReader(twtxt))
+	userdata := []*User{}
+
+	for scanner.Scan() {
+		nopadding := strings.TrimSpace(scanner.Text())
+		if nopadding == "" || strings.HasPrefix(nopadding, "#") {
+			continue
+		}
+
+		columns := strings.Split(nopadding, "\t")
+		if len(columns) != 4 {
+			return nil, fmt.Errorf("improperly formatted data")
+		}
+		parsednickname, parsedurl := columns[0], columns[1]
+
+		thetime, err := time.Parse(time.RFC3339, columns[2])
+		if err != nil {
+			fmt.Fprintf(&errs, "%v\n", err)
+			continue
+		}
+
+		// Look for a user already collected under the same nick or URL.
+		var known *User
+		for _, u := range userdata {
+			if u.Nick == parsednickname || u.URL == parsedurl {
+				known = u
+				break
+			}
+		}
+		if known != nil {
+			known.Status[thetime] = nopadding
+			continue
+		}
+
+		userdata = append(userdata, &User{
+			Mu:   sync.RWMutex{},
+			Nick: parsednickname,
+			URL:  parsedurl,
+			Date: time.Now().Format(time.RFC3339),
+			Status: TimeMap{
+				thetime: nopadding,
+			},
+		})
+	}
+
+	// Surface scanner failures (e.g. an over-long line) instead of
+	// silently returning a truncated result.
+	if err := scanner.Err(); err != nil {
+		fmt.Fprintf(&errs, "error scanning registry data: %v\n", err)
+	}
+
+	// Only report an error when something actually went wrong.
+	if errs.Len() == 0 {
+		return userdata, nil
+	}
+	return userdata, fmt.Errorf("%v", errs.String())
+}